Closes 12 template drift gaps between the Control Panel templates and production configs. New instances now provision with full monitoring (alerts fire properly), the correct Gitea DB type (postgres, not mysql), social sharing previews (OG meta bot routes), Excalidraw subdomain routing, docker-socket-proxy for Homepage, and complete Grafana/Alertmanager/Prometheus config copying.

Key changes:
- Rewrite the Prometheus template: add alerting, rule_files, and 5 scrape jobs (a sketch follows below)
- Add cAdvisor, node-exporter, redis-exporter, gotify, and docker-socket-proxy
- Fix the Gitea env from mysql to postgres to match docker-compose
- Add OG bot detection + rewrite routes for campaigns/pages/gallery
- Add an Excalidraw nginx server block + Pangolin draw subdomain
- Add the embed port to the discovery portConfig + emailTestMode to registration
- Copy alerts.yml, alertmanager.yml, and the Grafana dashboards to the templates
- Add the Listmonk proxy port and an upgrade volume to the API service
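For context, here is a minimal sketch of what the rewritten prometheus.yml template could look like with the alerting block, rule_files, and the five scrape jobs wired together. The container hostnames, the API port, and the rules path are assumptions (standard exporter defaults plus the job names referenced in alerts.yml below), not the actual template values:

global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Route fired alerts to Alertmanager (hostname/port are assumed defaults)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Load the alert rules copied into the template (path is an assumption)
rule_files:
  - /etc/prometheus/alerts.yml

scrape_configs:
  - job_name: changemaker-v2-api          # job name matches the ApplicationDown alert
    static_configs:
      - targets: ["api:3000"]             # assumed API container/port
  - job_name: alertmanager                # job name matches the AlertmanagerDown alert
    static_configs:
      - targets: ["alertmanager:9093"]
  - job_name: node-exporter               # node_* metrics for disk/CPU/memory alerts
    static_configs:
      - targets: ["node-exporter:9100"]
  - job_name: cadvisor                    # container_* metrics for container alerts
    static_configs:
      - targets: ["cadvisor:8080"]
  - job_name: redis-exporter              # redis_up for the RedisDown alert
    static_configs:
      - targets: ["redis-exporter:9121"]

With rule_files pointing at alerts.yml and the alerting block pointing at Alertmanager, the rule groups below fire into whatever receivers the copied alertmanager.yml defines.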
The copied alerts.yml template (YAML, 216 lines, 7.5 KiB):
groups:
  - name: v2_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="changemaker-v2-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "V2 API is down"
          description: "The Changemaker V2 API has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: cm_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

      # External service down
      - alert: ExternalServiceDown
        expr: cm_external_service_up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service {{ $labels.service }} is down"
          description: "Service {{ $labels.service }} has been unreachable for 5 minutes."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Container CPU throttling (only Docker containers)
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total{name!=""}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high (only Docker containers with memory limits)
      - alert: ContainerMemoryHigh
        expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) > 0.90 and container_spec_memory_limit_bytes{name!=""} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"

  # Security alerts
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."