Update notification service

This commit is contained in:
MacRimi
2026-02-27 19:47:36 +01:00
parent be119a69af
commit 171e7ddcae
4 changed files with 84 additions and 43 deletions

View File

@@ -324,7 +324,8 @@ class HealthMonitor:
Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking.
"""
# Run cleanup on every status check to auto-resolve stale errors
# Run cleanup on every status check so stale errors are auto-resolved
# using the user-configured Suppression Duration (single source of truth).
try:
health_persistence.cleanup_old_errors()
except Exception:
@@ -2157,18 +2158,18 @@ class HealthMonitor:
# Get a representative critical error reason
representative_error = next(iter(critical_errors_found.values()))
reason = f'Critical error detected: {representative_error[:100]}'
elif cascade_count > 0:
status = 'WARNING'
samples = _get_samples(cascading_errors, 3)
reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f' - {s}' for s in samples)
elif spike_count > 0:
status = 'WARNING'
samples = _get_samples(spike_errors, 3)
reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f' - {s}' for s in samples)
elif persistent_count > 0:
status = 'WARNING'
samples = _get_samples(persistent_errors, 3)
reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f' - {s}' for s in samples)
elif cascade_count > 0:
status = 'WARNING'
samples = _get_samples(cascading_errors, 3)
reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f' - {s}' for s in samples)
elif spike_count > 0:
status = 'WARNING'
samples = _get_samples(spike_errors, 3)
reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f' - {s}' for s in samples)
elif persistent_count > 0:
status = 'WARNING'
samples = _get_samples(persistent_errors, 3)
reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f' - {s}' for s in samples)
else:
# No significant issues found
status = 'OK'
@@ -2189,23 +2190,23 @@ class HealthMonitor:
'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
}
# Track which sub-checks were dismissed
dismissed_keys = set()
for err_key, info in log_sub_checks.items():
if info['active']:
is_dismissable = info.get('dismissable', True)
result = health_persistence.record_error(
error_key=err_key,
category='logs',
severity=info['severity'],
reason=info['reason'],
details={'dismissable': is_dismissable}
)
if result and result.get('type') == 'skipped_acknowledged':
dismissed_keys.add(err_key)
elif health_persistence.is_error_active(err_key):
health_persistence.clear_error(err_key)
# Track which sub-checks were dismissed
dismissed_keys = set()
for err_key, info in log_sub_checks.items():
if info['active']:
is_dismissable = info.get('dismissable', True)
result = health_persistence.record_error(
error_key=err_key,
category='logs',
severity=info['severity'],
reason=info['reason'],
details={'dismissable': is_dismissable}
)
if result and result.get('type') == 'skipped_acknowledged':
dismissed_keys.add(err_key)
elif health_persistence.is_error_active(err_key):
health_persistence.clear_error(err_key)
# Build checks dict - downgrade dismissed items to INFO
def _log_check_status(key, active, severity):

View File

@@ -26,7 +26,7 @@ class HealthPersistence:
"""Manages persistent health error tracking"""
# Default suppression duration when no user setting exists for a category.
# Users can override per-category via the Suppression Duration settings.
# Users override per-category via the Suppression Duration settings UI.
DEFAULT_SUPPRESSION_HOURS = 24
# Mapping from error categories to settings keys
@@ -498,13 +498,16 @@ class HealthPersistence:
cutoff_resolved = (now - timedelta(days=7)).isoformat()
cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))
# ── Auto-resolve stale errors using user-configured Suppression Duration ──
# Read the per-category suppression hours from user_settings.
# If the user hasn't configured a category, fall back to DEFAULT_SUPPRESSION_HOURS.
# ── Auto-resolve stale errors using Suppression Duration settings ──
# Read per-category suppression hours from user_settings.
# If the user hasn't configured a value, use DEFAULT_SUPPRESSION_HOURS.
# This is the SINGLE source of truth for auto-resolution timing.
user_settings = {}
try:
cursor.execute('SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?', ('suppress_%',))
cursor.execute(
'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
('suppress_%',)
)
for row in cursor.fetchall():
user_settings[row[0]] = row[1]
except Exception:
@@ -517,6 +520,10 @@ class HealthPersistence:
except (ValueError, TypeError):
hours = self.DEFAULT_SUPPRESSION_HOURS
# Any negative value (e.g. -1) means permanently suppressed -- skip auto-resolve
if hours < 0:
continue
cutoff = (now - timedelta(hours=hours)).isoformat()
cursor.execute('''
UPDATE errors
@@ -527,7 +534,7 @@ class HealthPersistence:
AND acknowledged = 0
''', (now_iso, category, cutoff))
# Catch-all: auto-resolve ANY error from an unmapped category
# Catch-all: auto-resolve any error from an unmapped category
# whose last_seen exceeds DEFAULT_SUPPRESSION_HOURS.
fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
cursor.execute('''