# Prometheus alerting rules for the Changemaker V2 stack.
#
# Phase 1-14 complete:
# - Unified Express.js API (TypeScript, Prisma ORM, PostgreSQL 16)
# - React Admin GUI (Vite + Ant Design + Zustand)
# - JWT auth with refresh tokens
# - Influence: Campaigns, Representatives, Responses, Email Queue
# - Map: Locations, Cuts, Shifts, Canvassing System
# - NAR data import infrastructure (2025 format)
# - Listmonk newsletter integration
# - Landing page builder (GrapesJS)
# - MkDocs + Code Server integration
# - Volunteer portal with GPS tracking
# - Monitoring stack (Prometheus, Grafana, Alertmanager)
# - Pangolin tunnel integration
#
# Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
groups:
  # Application-level alerts for the Changemaker V2 API.
  - name: v2_app_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="changemaker-v2-api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "V2 API is down"
          description: "The Changemaker V2 API has been down for more than 2 minutes."

      # High error rate (5xx responses)
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second."

      # Email queue backing up
      - alert: EmailQueueBacklog
        expr: cm_email_queue_size > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Email queue has significant backlog"
          description: "Email queue size is {{ $value }}, emails may be delayed."

      # High email failure rate (>20% of sends failing)
      # NOTE(review): when no emails are sent in the 5m window, the ratio is
      # NaN and the alert stays inactive — confirm that is the intended
      # behavior for idle periods.
      - alert: HighEmailFailureRate
        expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High email failure rate"
          description: "{{ $value | humanizePercentage }} of emails are failing to send."

      # Failed login attempts spike
      - alert: SuspiciousLoginActivity
        expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious login activity detected"
          description: "{{ $value }} failed login attempts per second detected."

      # High API latency (p95 over 2 seconds)
      - alert: HighAPILatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

      # External service down
      - alert: ExternalServiceDown
        expr: cm_external_service_up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "External service {{ $labels.service }} is down"
          description: "Service {{ $labels.service }} has been unreachable for 5 minutes."
# System health alerts
|
|
- name: system_alerts
|
|
interval: 30s
|
|
rules:
|
|
# Redis down
|
|
- alert: RedisDown
|
|
expr: redis_up == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Redis cache is down"
|
|
description: "Redis has been down for more than 1 minute. Caching and session management will fail."
|
|
|
|
# Disk space running low
|
|
- alert: DiskSpaceLow
|
|
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk space is running low"
|
|
description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."
|
|
|
|
# Disk space critical
|
|
- alert: DiskSpaceCritical
|
|
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "CRITICAL: Disk space nearly exhausted"
|
|
description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."
|
|
|
|
# High CPU usage
|
|
- alert: HighCPUUsage
|
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High CPU usage detected"
|
|
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."
|
|
|
|
# Memory usage high
|
|
- alert: HighMemoryUsage
|
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage"
|
|
description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."
|
|
|
|
# Container CPU throttling
|
|
- alert: ContainerCPUThrottling
|
|
expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container is being CPU throttled"
|
|
description: "Container {{ $labels.name }} is experiencing CPU throttling."
|
|
|
|
# Container memory usage high
|
|
- alert: ContainerMemoryHigh
|
|
expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container memory usage is high"
|
|
description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."
|
|
|
|
# Infrastructure alerts
|
|
- name: infrastructure_alerts
|
|
interval: 30s
|
|
rules:
|
|
# Prometheus scrape failures
|
|
- alert: PrometheusScrapeFailures
|
|
expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus scrape failures detected"
|
|
description: "Prometheus is failing to scrape {{ $labels.job }} target."
|
|
|
|
# Prometheus configuration reload failure
|
|
- alert: PrometheusConfigReloadFailed
|
|
expr: prometheus_config_last_reload_successful == 0
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus configuration reload failed"
|
|
description: "Prometheus failed to reload its configuration. Check prometheus logs."
|
|
|
|
# Alertmanager down
|
|
- alert: AlertmanagerDown
|
|
expr: up{job="alertmanager"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Alertmanager is down"
|
|
description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"
|
|
|
|
# Security alerts
|
|
- name: security_alerts
|
|
interval: 15s
|
|
rules:
|
|
# Possible DDoS attack
|
|
- alert: PossibleDDoSAttack
|
|
expr: rate(http_requests_total[1m]) > 1000
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Possible DDoS attack detected"
|
|
description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."
|
|
|
|
# Sustained high traffic
|
|
- alert: SustainedHighTraffic
|
|
expr: rate(http_requests_total[5m]) > 500
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Sustained high traffic detected"
|
|
description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."
|
|
|
|
# Too many 4xx errors
|
|
- alert: HighClientErrorRate
|
|
expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High rate of 4xx client errors"
|
|
description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."
|