update notification_events.py

This commit is contained in:
MacRimi
2026-04-09 12:34:03 +02:00
parent d8631a8594
commit 2b8caa924f
6 changed files with 143 additions and 21 deletions

View File

@@ -29,6 +29,17 @@ export default function Home() {
const response = await fetch(getApiUrl("/api/auth/status"), {
headers: token ? { Authorization: `Bearer ${token}` } : {},
})
// Check if response is valid JSON before parsing
if (!response.ok) {
throw new Error(`HTTP ${response.status}`)
}
const contentType = response.headers.get("content-type")
if (!contentType || !contentType.includes("application/json")) {
throw new Error("Response is not JSON")
}
const data = await response.json()
const authenticated = data.auth_enabled ? data.authenticated : true
@@ -39,8 +50,8 @@ export default function Home() {
authConfigured: data.auth_configured,
authenticated,
})
} catch (error) {
console.error("Failed to check auth status:", error)
} catch {
// API not available - assume no auth configured (silent fail, no console error)
setAuthStatus({
loading: false,
authEnabled: false,

View File

@@ -27,18 +27,26 @@ export function AuthSetup({ onComplete }: AuthSetupProps) {
const checkOnboardingStatus = async () => {
try {
const response = await fetch(getApiUrl("/api/auth/status"))
// Check if response is valid JSON before parsing
if (!response.ok) {
// API not available - don't show modal in preview
return
}
const contentType = response.headers.get("content-type")
if (!contentType || !contentType.includes("application/json")) {
return
}
const data = await response.json()
console.log("[v0] Auth status for modal check:", data)
// Show modal if auth is not configured and not declined
if (!data.auth_configured) {
setTimeout(() => setOpen(true), 500)
}
} catch (error) {
console.error("[v0] Failed to check auth status:", error)
// Fail-safe: show modal if we can't check status
setTimeout(() => setOpen(true), 500)
} catch {
// API not available (preview environment) - don't show modal
}
}

View File

@@ -299,6 +299,19 @@ export function NotificationSettings() {
fallback_commands: string[]
error: string
}>({ status: "idle", fallback_commands: [], error: "" })
const [systemHostname, setSystemHostname] = useState<string>("")
// Ask the backend (/api/system) for the machine's hostname and keep it in
// state; the Display Name input uses it as its placeholder text.
const loadSystemHostname = useCallback(async () => {
  try {
    const { hostname } = await fetchApi<{ hostname?: string }>("/api/system")
    // Only store a non-empty hostname; otherwise leave the state untouched.
    if (hostname) setSystemHostname(hostname)
  } catch {
    // Best effort: on failure the input falls back to its generic placeholder.
  }
}, [])
const loadConfig = useCallback(async () => {
try {
@@ -366,7 +379,8 @@ export function NotificationSettings() {
useEffect(() => {
loadConfig()
loadStatus()
}, [loadConfig, loadStatus])
loadSystemHostname()
}, [loadConfig, loadStatus, loadSystemHostname])
useEffect(() => {
if (showHistory) loadHistory()
@@ -1505,6 +1519,25 @@ export function NotificationSettings() {
</div>{/* close bordered channel container */}
</div>
{/* ── Display Name ── */}
<div className="space-y-2 pb-3 border-b border-border/50">
<div className="flex items-center gap-2">
<Server className="h-4 w-4 text-blue-400" />
<Label className="text-xs sm:text-sm text-foreground/80">Display Name</Label>
</div>
<Input
className={`h-9 text-sm ${!editMode ? "opacity-50 cursor-not-allowed" : ""}`}
placeholder={systemHostname || "System hostname"}
value={config.hostname || (editMode ? "" : systemHostname)}
onChange={e => updateConfig(p => ({ ...p, hostname: e.target.value }))}
disabled={!editMode}
readOnly={!editMode}
/>
<p className="text-xs text-muted-foreground">
Name shown in notifications. Edit to customize, or leave empty to use the system hostname.
</p>
</div>
{/* ── Advanced: AI Enhancement ── */}
<div>
<div className="flex items-center justify-between py-1">

View File

@@ -641,11 +641,18 @@ export function Security() {
const checkAuthStatus = async () => {
try {
const response = await fetch(getApiUrl("/api/auth/status"))
// Check if response is valid JSON before parsing
if (!response.ok) return
const contentType = response.headers.get("content-type")
if (!contentType || !contentType.includes("application/json")) return
const data = await response.json()
setAuthEnabled(data.auth_enabled || false)
setTotpEnabled(data.totp_enabled || false)
} catch (err) {
console.error("Failed to check auth status:", err)
} catch {
// API not available (preview environment)
}
}

View File

@@ -181,6 +181,20 @@ class HealthMonitor:
# not a system problem.
r'pvescheduler.*could not update job state',
r'pvescheduler.*no such task',
# ── GPU passthrough / vfio operational noise ──
# When a GPU is passed through to a VM using vfio-pci, the host
# NVIDIA driver will log errors because it cannot access the GPU.
# This is expected behavior, NOT an error - the passthrough is working.
r'NVRM.*GPU.*already bound to vfio-pci',
r'NVRM.*GPU.*is not supported',
r'NVRM.*failed to enable MSI',
r'NVRM.*RmInitAdapter failed',
r'NVRM.*rm_init_adapter failed',
r'nvidia.*probe.*failed',
r'vfio-pci.*\d+:\d+:\d+\.\d+.*reset',
r'vfio-pci.*enabling device',
r'vfio_pci.*cannot assign irq',
]
CRITICAL_LOG_KEYWORDS = [
@@ -745,7 +759,13 @@ class HealthMonitor:
}
def _check_cpu_with_hysteresis(self) -> Dict[str, Any]:
"""Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage"""
"""Check CPU with hysteresis to avoid flapping alerts - requires sustained high usage.
With samples every ~10 seconds:
- CRITICAL: 30 samples >= 95% in 300s window = 5 min sustained
- WARNING: 30 samples >= 85% in 300s window = 5 min sustained
- RECOVERY: 12 samples < 75% in 120s window = 2 min below threshold
"""
try:
cpu_percent = psutil.cpu_percent(interval=0.1) # 100ms sample - sufficient for health check
current_time = time.time()
@@ -765,6 +785,7 @@ class HealthMonitor:
if current_time - entry['time'] < 360
]
# Count samples in the monitoring windows
critical_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] >= self.CPU_CRITICAL and
@@ -783,27 +804,39 @@ class HealthMonitor:
current_time - entry['time'] <= self.CPU_RECOVERY_DURATION
]
if len(critical_samples) >= 3:
# Require enough samples to cover the sustained period
# With ~10s sampling interval: 300s = ~30 samples, 120s = ~12 samples
# Using slightly lower thresholds to account for timing variations
CRITICAL_MIN_SAMPLES = 25 # ~250s of sustained high CPU
WARNING_MIN_SAMPLES = 25 # ~250s of sustained elevated CPU
RECOVERY_MIN_SAMPLES = 10 # ~100s of recovery
if len(critical_samples) >= CRITICAL_MIN_SAMPLES:
# Calculate actual duration from oldest to newest sample
oldest = min(s['time'] for s in critical_samples)
actual_duration = int(current_time - oldest)
status = 'CRITICAL'
reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
reason = f'CPU >{self.CPU_CRITICAL}% sustained for {actual_duration}s'
# Record the error
health_persistence.record_error(
error_key='cpu_usage',
category='cpu',
severity='CRITICAL',
reason=reason,
details={'cpu_percent': cpu_percent}
details={'cpu_percent': cpu_percent, 'duration': actual_duration}
)
elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
elif len(warning_samples) >= WARNING_MIN_SAMPLES and len(recovery_samples) < RECOVERY_MIN_SAMPLES:
oldest = min(s['time'] for s in warning_samples)
actual_duration = int(current_time - oldest)
status = 'WARNING'
reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
reason = f'CPU >{self.CPU_WARNING}% sustained for {actual_duration}s'
# Record the warning
health_persistence.record_error(
error_key='cpu_usage',
category='cpu',
severity='WARNING',
reason=reason,
details={'cpu_percent': cpu_percent}
details={'cpu_percent': cpu_percent, 'duration': actual_duration}
)
else:
status = 'OK'
@@ -921,9 +954,15 @@ class HealthMonitor:
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
if len(high_temp_samples) >= 18:
# Temperature has been >80°C for >3 minutes
# Temperature has been >80°C for >3 minutes - calculate actual duration
oldest = min(s['time'] for s in high_temp_samples)
actual_duration = int(current_time - oldest)
actual_minutes = actual_duration // 60
actual_seconds = actual_duration % 60
duration_str = f'{actual_minutes}m {actual_seconds}s' if actual_minutes > 0 else f'{actual_seconds}s'
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
reason = f'CPU temperature {max_temp}°C >80°C sustained for {duration_str}'
# Record non-dismissable error
health_persistence.record_error(
@@ -931,7 +970,7 @@ class HealthMonitor:
category='temperature',
severity='WARNING',
reason=reason,
details={'temperature': max_temp, 'dismissable': False}
details={'temperature': max_temp, 'duration': actual_duration, 'dismissable': False}
)
elif len(recovery_samples) >= 3:
# Temperature has been ≤80°C for 30 seconds - clear the error

View File

@@ -137,6 +137,30 @@ class NotificationEvent:
def _hostname() -> str:
"""Get display hostname for notifications.
Returns the custom display name from notification settings if configured,
otherwise falls back to the system hostname.
"""
# Try to read custom display name from notification settings
try:
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
if db_path.exists():
conn = sqlite3.connect(str(db_path), timeout=5)
conn.execute('PRAGMA busy_timeout=3000')
cursor = conn.cursor()
cursor.execute(
"SELECT setting_value FROM user_settings WHERE setting_key = ?",
('notification.hostname',)
)
row = cursor.fetchone()
conn.close()
if row and row[0] and row[0].strip():
return row[0].strip()
except Exception:
pass # Fall back to system hostname
# Fall back to system hostname
try:
return socket.gethostname().split('.')[0]
except Exception: