bunker-admin 99a6abab06 Add video card insert feature + MkDocs video hydration + fixes
- New video card block for GrapesJS landing pages, email templates,
  MkDocs export, and documentation editor Insert dropdown
- Shared HTML generators in admin/src/utils/videoCardHtml.ts
- MkDocs video-player.js hydrates .video-card-block elements:
  thumbnail fix via MEDIA_API_URL, click-to-play inline, Gallery link
- Media API CORS: auto-add MkDocs + docs subdomain origins
- env_config_hook.py: smart Docker hostname detection, ADMIN_PORT
  resolution, pass env vars to MkDocs container
- Gallery URL uses /gallery?expanded=ID format
- VideoPickerModal: fix double /api prefix and Docker hostname thumbs
- Seed: default-video-card PageBlock
- Remove V1 legacy code (influence/, map/)

Bunker Admin
2026-02-17 15:42:32 -07:00

216 lines
7.5 KiB
YAML

groups:
# Application-level alerts for the Changemaker V2 API: availability, HTTP
# error rate and latency, email delivery, login failures, and external
# service reachability.
- name: v2_app_alerts
  # Evaluation interval for every rule in this group.
  interval: 30s
  rules:
  # Application availability: the `up` series of the API scrape job is 0.
  - alert: ApplicationDown
    expr: up{job="changemaker-v2-api"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "V2 API is down"
      description: "The Changemaker V2 API has been down for more than 2 minutes."

  # High error rate: 5xx responses, averaged over a 5-minute window.
  # NOTE(review): rate() is evaluated per series, so the 0.1/s threshold is
  # applied to each label combination separately rather than to the
  # application total. Confirm this is intended or aggregate with sum().
  - alert: HighErrorRate
    expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "Application is experiencing {{ $value }} errors per second."

  # Email queue backing up: more than 100 queued items for 10 minutes.
  - alert: EmailQueueBacklog
    expr: cm_email_queue_size > 100
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Email queue has significant backlog"
      description: "Email queue size is {{ $value }}, emails may be delayed."

  # High email failure rate: failed rate divided by sent rate exceeds 20%.
  # NOTE(review): whether cm_emails_sent_total counts attempts or only
  # successful sends is not visible here; confirm the denominator is the
  # one the percentage in the description implies.
  - alert: HighEmailFailureRate
    expr: rate(cm_emails_failed_total[5m]) / rate(cm_emails_sent_total[5m]) > 0.2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High email failure rate"
      description: "{{ $value | humanizePercentage }} of emails are failing to send."

  # Failed login attempts spike: more than 5 failures per second.
  - alert: SuspiciousLoginActivity
    expr: rate(cm_login_attempts_total{status="failure"}[5m]) > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Suspicious login activity detected"
      description: "{{ $value }} failed login attempts per second detected."

  # High API latency: 95th percentile request duration above 2 seconds.
  # NOTE(review): there is no explicit `sum by (le, ...)`, so one quantile
  # is computed per distinct label set of the histogram. The description
  # reads $labels.route, which presumes the histogram carries a `route`
  # label -- confirm against the exporter.
  - alert: HighAPILatency
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High API latency"
      description: "95th percentile latency is {{ $value }}s for {{ $labels.route }}."

  # External service down: one alert per `service` label value reporting 0.
  - alert: ExternalServiceDown
    expr: cm_external_service_up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "External service {{ $labels.service }} is down"
      description: "Service {{ $labels.service }} has been unreachable for 5 minutes."
# System health alerts: Redis, root filesystem space, host CPU and memory,
# and per-container CPU throttling and memory pressure.
- name: system_alerts
  # Evaluation interval for every rule in this group.
  interval: 30s
  rules:
  # Redis down: the exporter reports redis_up == 0 for 1 minute.
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis cache is down"
      description: "Redis has been down for more than 1 minute. Caching and session management will fail."

  # Disk space running low: less than 15% of the root filesystem available.
  # NOTE(review): below 10% this rule and DiskSpaceCritical both match, so
  # both alerts fire unless the warning is inhibited in Alertmanager --
  # confirm that is the intended behavior.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk space is running low"
      description: "Only {{ $value | humanizePercentage }} disk space remaining on root filesystem."

  # Disk space critical: less than 10% available, with a shorter hold time.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "CRITICAL: Disk space nearly exhausted"
      description: "Only {{ $value | humanizePercentage }} disk space remaining! System may fail soon."

  # High CPU usage: 100 minus the per-instance average idle percentage,
  # i.e. the busy percentage, above 85 for 10 minutes.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage detected"
      description: "CPU usage is {{ $value }}% on {{ $labels.instance }}."

  # Memory usage high: 1 - MemAvailable/MemTotal above 0.85 for 10 minutes.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage"
      description: "Memory usage is above 85% ({{ $value | humanizePercentage }})."

  # Container CPU throttling (only Docker containers): the name!="" matcher
  # drops series that have no container name.
  - alert: ContainerCPUThrottling
    expr: rate(container_cpu_cfs_throttled_seconds_total{name!=""}[5m]) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container is being CPU throttled"
      description: "Container {{ $labels.name }} is experiencing CPU throttling."

  # Container memory usage high (only Docker containers with memory limits):
  # the trailing `and ... > 0` removes containers whose limit is reported
  # as 0, for which the usage/limit ratio would otherwise be infinite.
  - alert: ContainerMemoryHigh
    expr: (container_memory_usage_bytes{name!=""} / container_spec_memory_limit_bytes{name!=""}) > 0.90 and container_spec_memory_limit_bytes{name!=""} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container memory usage is high"
      description: "Container {{ $labels.name }} is using {{ $value | humanizePercentage }} of its memory limit."
# Infrastructure alerts: health of the monitoring stack itself
# (Prometheus scraping and configuration reloads, Alertmanager availability).
- name: infrastructure_alerts
  # Evaluation interval for every rule in this group.
  interval: 30s
  rules:
  # Prometheus scrape failures.
  # NOTE(review): verify that the running Prometheus version actually
  # exports prometheus_target_scrapes_failed_total; if the series does not
  # exist this rule never fires. If it does exist as a Prometheus
  # self-metric, $labels.job would name the Prometheus job rather than the
  # failing target -- confirm. `up == 0` is the usual per-target signal.
  - alert: PrometheusScrapeFailures
    expr: rate(prometheus_target_scrapes_failed_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus scrape failures detected"
      description: "Prometheus is failing to scrape {{ $labels.job }} target."

  # Prometheus configuration reload failure: the last-reload gauge is 0.
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus configuration reload failed"
      description: "Prometheus failed to reload its configuration. Check prometheus logs."

  # Alertmanager down: the `up` series of the alertmanager scrape job is 0.
  # NOTE(review): if notifications are routed only through this
  # Alertmanager, it cannot deliver this alert about itself -- confirm an
  # independent notification path or watchdog exists.
  - alert: AlertmanagerDown
    expr: up{job="alertmanager"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Alertmanager is down"
      description: "Alertmanager has been down for 2 minutes. Alerts will not be delivered!"
# Security alerts: abnormal HTTP request volume and client-error rates.
- name: security_alerts
  # Evaluation interval for every rule in this group.
  interval: 15s
  rules:
  # Possible DDoS attack: request rate over a 1-minute window above 1000/s,
  # sustained for 2 minutes.
  # NOTE(review): rate() is evaluated per series, so the threshold applies
  # to each label combination of http_requests_total separately; traffic
  # spread across many series may never cross it. Confirm whether a
  # sum(...) over the total was intended.
  - alert: PossibleDDoSAttack
    expr: rate(http_requests_total[1m]) > 1000
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Possible DDoS attack detected"
      description: "Receiving {{ $value }} requests per second for 2 minutes. This may be a DDoS attack."

  # Sustained high traffic: request rate over a 5-minute window above 500/s,
  # sustained for 10 minutes. The per-series caveat above applies here too.
  - alert: SustainedHighTraffic
    expr: rate(http_requests_total[5m]) > 500
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Sustained high traffic detected"
      description: "Receiving {{ $value }} requests per second for 10 minutes. Monitor for performance issues."

  # Too many 4xx errors: more than 5 client-error responses per second.
  - alert: HighClientErrorRate
    expr: rate(http_requests_total{status_code=~"4.."}[5m]) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High rate of 4xx client errors"
      description: "Receiving {{ $value }} client errors per second. Check for broken links or API misuse."