# NOTE(review): the text below is copy/paste residue from a web file viewer
# (a commit message plus "216 lines / 7.5 KiB / YAML" metadata) and is
# unrelated to these alert rules — preserved here as comments; confirm and remove.
#
# "New video card block for GrapesJS landing pages, email templates, MkDocs
#  export, and documentation editor Insert dropdown - Shared HTML generators
#  in admin/src/utils/videoCardHtml.ts - MkDocs video-player.js hydrates
#  .video-card-block elements: thumbnail fix via MEDIA_API_URL, click-to-play
#  inline, Gallery link - Media API CORS: auto-add MkDocs + docs subdomain
#  origins - env_config_hook.py: smart Docker hostname detection, ADMIN_PORT
#  resolution, pass env vars to MkDocs container - Gallery URL uses
#  /gallery?expanded=ID format - VideoPickerModal: fix double /api prefix and
#  Docker hostname thumbs - Seed: default-video-card PageBlock - Remove V1
#  legacy code (influence/, map/) Bunker Admin"
---
# Prometheus alerting rules for the Changemaker V2 stack.
#
# Reconstructed from a garbled copy/paste export: the original had every line
# interleaved with a lone "|" gutter character and all indentation stripped,
# which made the document invalid YAML. All expressions, durations, severities
# and annotation strings are preserved byte-for-byte; only structure
# (indentation, document start marker) was restored.
#
# Validate after any change with: promtool check rules <this-file>
groups:
  # Application-level alerts for the V2 API and its business metrics.
  - name: v2_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="changemaker-v2-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "V2 API is down"
          description: "The Changemaker V2 API has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: cm_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

      # External service down
      - alert: ExternalServiceDown
        expr: cm_external_service_up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service {{ $labels.service }} is down"
          description: "Service {{ $labels.service }} has been unreachable for 5 minutes."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Container CPU throttling (only Docker containers)
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total{name!=""}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high (only Docker containers with memory limits)
      - alert: ContainerMemoryHigh
        expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) > 0.90 and container_spec_memory_limit_bytes{name!=""} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      # NOTE(review): verify the metric name against the running Prometheus
      # version — stock Prometheus exposes scrape health via `up` and
      # `prometheus_target_scrape_*` series; confirm
      # `prometheus_target_scrapes_failed_total` exists in this deployment.
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"

  # Security alerts (tighter 15s evaluation interval for faster detection)
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."