bunker-admin 5642a24c8f Sync CCP templates with production configs for complete instance provisioning
Closes 12 template drift gaps between the Control Panel templates and
production configs. New instances now provision with full monitoring
(alerts fire properly), correct Gitea DB type (postgres not mysql),
social sharing previews (OG meta bot routes), Excalidraw subdomain
routing, docker-socket-proxy for Homepage, and complete Grafana/
Alertmanager/Prometheus config copying.

Key changes:
- Rewrite Prometheus template: add alerting, rule_files, 5 scrape jobs (see the first sketch after this list)
- Add cAdvisor, node-exporter, redis-exporter, gotify, docker-socket-proxy
- Fix Gitea env from mysql to postgres to match docker-compose (second sketch below)
- Add OG bot detection + rewrite routes for campaigns/pages/gallery
- Add Excalidraw nginx server block + Pangolin draw subdomain
- Add embed port to discovery portConfig + emailTestMode to registration
- Copy alerts.yml, alertmanager.yml, Grafana dashboards to templates (alertmanager.yml sketched after the rules file)
- Add Listmonk proxy port and upgrade volume to API service
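
For reference, a minimal sketch of what the rewritten Prometheus template could look like with alerting, rule_files, and scrape jobs wired in. Only the changemaker-v2-api and alertmanager job names are confirmed by the alert rules; which five jobs the template actually defines, and every service name and port below, are illustrative assumptions:

# prometheus.yml (sketch; targets and ports are assumptions)
global:
  scrape_interval: 15s
  evaluation_interval: 30s

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

rule_files:
  - /etc/prometheus/alerts.yml

scrape_configs:
  - job_name: changemaker-v2-api      # job name referenced by the ApplicationDown rule
    static_configs:
      - targets: ["api:3000"]         # assumed service name/port
  - job_name: alertmanager            # job name referenced by the AlertmanagerDown rule
    static_configs:
      - targets: ["alertmanager:9093"]
  - job_name: node-exporter
    static_configs:
      - targets: ["node-exporter:9100"]
  - job_name: cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]
  - job_name: redis-exporter
    static_configs:
      - targets: ["redis-exporter:9121"]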
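
And a sketch of the Gitea DB-type fix, using Gitea's standard GITEA__section__KEY environment-variable convention; the service name, database name, and credentials are placeholders:

# docker-compose.yml excerpt (sketch; service name and credentials are placeholders)
services:
  gitea:
    environment:
      - GITEA__database__DB_TYPE=postgres   # was mysql, mismatching the compose file's postgres service
      - GITEA__database__HOST=postgres:5432
      - GITEA__database__NAME=gitea
      - GITEA__database__USER=gitea
      - GITEA__database__PASSWD=${GITEA_DB_PASSWD}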

Bunker Admin
2026-03-05 08:32:49 -07:00

216 lines · 7.5 KiB · YAML

groups:
  - name: v2_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="changemaker-v2-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "V2 API is down"
          description: "The Changemaker V2 API has been down for more than 2 minutes."

      # High error rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: cm_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate
      - alert: HighEmailFailureRate
        expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # High API latency
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

      # External service down
      - alert: ExternalServiceDown
        expr: cm_external_service_up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service {{ $labels.service }} is down"
          description: "Service {{ $labels.service }} has been unreachable for 5 minutes."

  # System health alerts
  - name: system_alerts
    interval: 30s
    rules:
      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cache is down"
          description: "Redis has been down for more than 1 minute. Caching and session management will fail."

      # Disk space running low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space is running low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk space nearly exhausted"
          description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

      # High CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

      # Memory usage high
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

      # Container CPU throttling (only Docker containers)
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_seconds_total{name!=""}[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container is being CPU throttled"
          description: "Container {{ $labels.name }} is experiencing CPU throttling."

      # Container memory usage high (only Docker containers with memory limits)
      - alert: ContainerMemoryHigh
        expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) > 0.90 and container_spec_memory_limit_bytes{name!=""} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container memory usage is high"
          description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."

  # Infrastructure alerts
  - name: infrastructure_alerts
    interval: 30s
    rules:
      # Prometheus scrape failures
      - alert: PrometheusScrapeFailures
        expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus scrape failures detected"
          description: "Prometheus is failing to scrape {{ $labels.job }} target."

      # Prometheus configuration reload failure
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus configuration reload failed"
          description: "Prometheus failed to reload its configuration. Check prometheus logs."

      # Alertmanager down
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"

  # Security alerts
  - name: security_alerts
    interval: 15s
    rules:
      # Possible DDoS attack
      - alert: PossibleDDoSAttack
        expr: rate(http_requests_total[1m]) > 1000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Possible DDoS attack detected"
          description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

      # Sustained high traffic
      - alert: SustainedHighTraffic
        expr: rate(http_requests_total[5m]) > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Sustained high traffic detected"
          description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

      # Too many 4xx errors
      - alert: HighClientErrorRate
        expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of 4xx client errors"
          description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."