# Prometheus alerting rules for the Changemaker V2 stack.
#
# Phase 1-14 complete:
# - Unified Express.js API (TypeScript, Prisma ORM, PostgreSQL 16)
# - React Admin GUI (Vite + Ant Design + Zustand)
# - JWT auth with refresh tokens
# - Influence: Campaigns, Representatives, Responses, Email Queue
# - Map: Locations, Cuts, Shifts, Canvassing System
# - NAR data import infrastructure (2025 format)
# - Listmonk newsletter integration
# - Landing page builder (GrapesJS)
# - MkDocs + Code Server integration
# - Volunteer portal with GPS tracking
# - Monitoring stack (Prometheus, Grafana, Alertmanager)
# - Pangolin tunnel integration
#
# Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
groups:
  # Application-level alerts for the Changemaker V2 API.
  - name: v2_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="changemaker-v2-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "V2 API is down"
          description: "The Changemaker V2 API has been down for more than 2 minutes."

      # High error rate (5xx responses)
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: cm_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate (>20% of sends failing)
      # NOTE(review): when no emails are sent in the 5m window, the ratio is
      # NaN and the alert stays inactive — confirm that is the intended
      # behavior for idle periods.
      - alert: HighEmailFailureRate
        expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # High API latency (p95 over 2 seconds)
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

      # External service down
      - alert: ExternalServiceDown
        expr: cm_external_service_up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service {{ $labels.service }} is down"
          description: "Service {{ $labels.service }} has been unreachable for 5 minutes."
# System health alerts
|
|
- name: system_alerts
|
|
interval: 30s
|
|
rules:
|
|
# Redis down
|
|
- alert: RedisDown
|
|
expr: redis_up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Redis cache is down"
|
|
description: "Redis has been down for more than 1 minute. Caching and session management will fail."
|
|
|
|
# Disk space running low
|
|
- alert: DiskSpaceLow
|
|
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk space is running low"
|
|
description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."
|
|
|
|
# Disk space critical
|
|
- alert: DiskSpaceCritical
|
|
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "CRITICAL: Disk space nearly exhausted"
|
|
description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."
|
|
|
|
# High CPU usage
|
|
- alert: HighCPUUsage
|
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High CPU usage detected"
|
|
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."
|
|
|
|
# Memory usage high
|
|
- alert: HighMemoryUsage
|
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage"
|
|
description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."
|
|
|
|
# Container CPU throttling
|
|
- alert: ContainerCPUThrottling
|
|
expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container is being CPU throttled"
|
|
description: "Container {{ $labels.name }} is experiencing CPU throttling."
|
|
|
|
# Container memory usage high
|
|
- alert: ContainerMemoryHigh
|
|
expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container memory usage is high"
|
|
description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."
|
|
|
|
# Infrastructure alerts
|
|
- name: infrastructure_alerts
|
|
interval: 30s
|
|
rules:
|
|
# Prometheus scrape failures
|
|
- alert: PrometheusScrapeFailures
|
|
expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus scrape failures detected"
|
|
description: "Prometheus is failing to scrape {{ $labels.job }} target."
|
|
|
|
# Prometheus configuration reload failure
|
|
- alert: PrometheusConfigReloadFailed
|
|
expr: prometheus_config_last_reload_successful == 0
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus configuration reload failed"
|
|
description: "Prometheus failed to reload its configuration. Check prometheus logs."
|
|
|
|
# Alertmanager down
|
|
- alert: AlertmanagerDown
|
|
expr: up{job="alertmanager"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Alertmanager is down"
|
|
description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"
|
|
|
|
# Security alerts
|
|
- name: security_alerts
|
|
interval: 15s
|
|
rules:
|
|
# Possible DDoS attack
|
|
- alert: PossibleDDoSAttack
|
|
expr: rate(http_requests_total[1m]) > 1000
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Possible DDoS attack detected"
|
|
description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."
|
|
|
|
# Sustained high traffic
|
|
- alert: SustainedHighTraffic
|
|
expr: rate(http_requests_total[5m]) > 500
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Sustained high traffic detected"
|
|
description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."
|
|
|
|
# Too many 4xx errors
|
|
- alert: HighClientErrorRate
|
|
expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High rate of 4xx client errors"
|
|
description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."
|