Add database migration phase and stale volume detection to upgrade script

Inserts Phase 5 (Database Migration) between container rebuild and service
restart. Detects failed/incomplete Prisma migrations via _prisma_migrations
query and auto-resolves them before running migrate deploy in a one-off
container — catching errors in the script rather than letting the API enter
a restart loop. Also detects when package.json/package-lock.json changed
and removes old API/admin containers to prevent stale anonymous volumes
from shadowing updated node_modules.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
admin 2026-03-09 12:19:53 -06:00
parent ef11f94e76
commit b061e2ce61

View File

@@ -749,28 +749,11 @@ if [[ "$PULL_SERVICES" == "true" ]]; then
# NOTE(review): this is a rendered unified diff with the +/- prefixes stripped,
# so removed (old) and added (new) lines appear interleaved below. Not runnable as-is.
fi
# =============================================================================
# Phase 5: Service Restart
# Phase 5: Database Migration
# =============================================================================
# NOTE(review): "Service Restart" above is the removed side of the diff;
# "Database Migration" is its replacement. The old Phase 5 body that follows
# (stop containers / clear LSIO volumes / verify Gancio config) was relocated
# to the new Phase 6 later in this diff.
phase "5" "Service Restart"
write_progress 5 "Service Restart" 70 "Restarting services..."
# Stop application containers
info "Stopping application containers..."
# Unquoted $APP_CONTAINERS is deliberate: word-splitting passes each container
# name as a separate argument to `docker compose stop`.
docker compose stop $APP_CONTAINERS 2>/dev/null || true
success "Application containers stopped"
# Force-recreate LSIO containers to prevent anonymous volume shadowing bind mounts.
# LSIO images define a VOLUME at /config in their Dockerfile. When a container is
# merely restarted, Docker reuses the old anonymous volume whose /config/www is empty,
# which shadows the bind mount (e.g., ./mkdocs/site:/config/www → 403 Forbidden).
# Removing the container first ensures a fresh anonymous volume that respects bind mounts.
info "Removing LSIO containers (clearing anonymous volumes)..."
docker compose rm -sf $LSIO_VOLUME_CONTAINERS 2>/dev/null || true
success "LSIO containers cleared for fresh recreation"
# Verify Gancio config.json exists before starting services
verify_gancio_config
# Added side: the new Phase 5 runs earlier in the progress scale (55%) because
# the restart phase that previously reported 70% now comes after it as Phase 6.
phase "5" "Database Migration"
write_progress 5 "Database Migration" 55 "Checking database state..."
# Ensure infrastructure is running and healthy
info "Ensuring infrastructure is up..."
@@ -790,8 +773,101 @@ while ! docker compose exec -T v2-postgres pg_isready -U "${V2_POSTGRES_USER:-ch
# NOTE(review): tail of the PostgreSQL readiness wait loop — its
# `while ! docker compose exec ... pg_isready` head lies outside this hunk
# (visible only in the hunk header context line).
done
success "PostgreSQL ready (${PG_WAIT}s)"
# Removed side of the diff: previously the API entrypoint ran
# `prisma db push` + seed on startup, so a failing migration surfaced only as
# an API restart loop — exactly what the new explicit phase below prevents.
# Start API first (entrypoint runs prisma db push + seed)
info "Starting API (migrations will auto-apply)..."
# Check for failed/incomplete migrations
info "Checking for failed migrations..."
# Query Prisma's bookkeeping table for migrations that were rolled back, or
# that started more than 10 minutes ago and never finished (presumed stuck).
# `-t -A` yields tuples-only, unaligned output: one migration_name per line.
# The trailing `|| true` keeps a `set -e` script alive when the table does not
# exist yet (fresh database) — FAILED_MIGRATIONS is simply empty in that case.
FAILED_MIGRATIONS="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT migration_name FROM _prisma_migrations
WHERE rolled_back_at IS NOT NULL
OR (finished_at IS NULL AND started_at IS NOT NULL
AND started_at < NOW() - INTERVAL '10 minutes')
" 2>/dev/null || true)"
if [[ -n "$FAILED_MIGRATIONS" ]]; then
warn "Found failed/incomplete migrations — auto-resolving..."
# Iterate line-by-line over the query result. The here-string feed (see the
# `done <<<` below) keeps the loop in the current shell — no subshell, so any
# variable changes would persist.
while IFS= read -r migration_name; do
[[ -z "$migration_name" ]] && continue
info " Resolving: $migration_name"
# `--entrypoint ""` bypasses the image's normal startup command; `--no-deps`
# avoids restarting linked services. NOTE(review): `migrate resolve --applied`
# marks the migration as fully applied — this assumes its schema changes
# actually landed before the failure; TODO confirm this is safe for
# partially-applied migrations.
docker compose run --rm --no-deps --entrypoint "" api \
npx prisma migrate resolve --applied "$migration_name" 2>&1 || {
warn " Could not auto-resolve $migration_name (may need manual intervention)"
}
done <<< "$FAILED_MIGRATIONS"
success "Failed migrations resolved"
else
success "No failed migrations found"
fi
# Run migrations in a one-off container (catches errors here, not in a restart loop)
info "Running database migrations..."
write_progress 5 "Database Migration" 60 "Applying migrations..."
# Hard failure path: a broken migration aborts the upgrade with actionable
# remediation steps instead of leaving the API crash-looping.
if ! docker compose run --rm --no-deps --entrypoint "" api \
npx prisma migrate deploy 2>&1; then
error "Database migration failed!"
error ""
error "Common fixes:"
error " 1. Check migration status:"
error " docker compose exec v2-postgres psql -U changemaker -d changemaker_v2 \\"
error " -c \"SELECT migration_name, finished_at, rolled_back_at FROM _prisma_migrations ORDER BY started_at DESC LIMIT 10;\""
error " 2. Mark a stuck migration as applied:"
error " docker compose run --rm --no-deps --entrypoint '' api npx prisma migrate resolve --applied <migration_name>"
error " 3. Check logs: docker compose logs api --tail 50"
error ""
error "After fixing, re-run: ./scripts/upgrade.sh --force --skip-backup"
exit 1
fi
# Count applied migrations
# On query failure the success line below shows "?" instead of a number.
MIGRATION_COUNT="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT COUNT(*) FROM _prisma_migrations WHERE finished_at IS NOT NULL
" 2>/dev/null || echo "?")"
success "Migrations up to date ($MIGRATION_COUNT total applied)"
# Run database seed (idempotent)
info "Running database seed..."
write_progress 5 "Database Migration" 65 "Seeding database..."
# Seed failures are deliberately non-fatal: seeding is expected to be
# idempotent and a seed warning should not block the upgrade.
if ! docker compose run --rm --no-deps --entrypoint "" api \
npx prisma db seed 2>&1; then
warn "Database seed had warnings (non-fatal, continuing)"
fi
success "Database seed complete"
# =============================================================================
# Phase 6: Service Restart
# =============================================================================
# NOTE(review): this phase is the old Phase 5 body relocated (see the removed
# lines earlier in this diff), renumbered 5 -> 6.
phase "6" "Service Restart"
write_progress 6 "Service Restart" 70 "Restarting services..."
# Stop application containers
info "Stopping application containers..."
# Unquoted on purpose: $APP_CONTAINERS word-splits into multiple service names.
docker compose stop $APP_CONTAINERS 2>/dev/null || true
success "Application containers stopped"
# Force-recreate LSIO containers to prevent anonymous volume shadowing bind mounts.
# LSIO images define a VOLUME at /config in their Dockerfile. When a container is
# merely restarted, Docker reuses the old anonymous volume whose /config/www is empty,
# which shadows the bind mount (e.g., ./mkdocs/site:/config/www → 403 Forbidden).
# Removing the container first ensures a fresh anonymous volume that respects bind mounts.
info "Removing LSIO containers (clearing anonymous volumes)..."
docker compose rm -sf $LSIO_VOLUME_CONTAINERS 2>/dev/null || true
success "LSIO containers cleared for fresh recreation"
# Verify Gancio config.json exists before starting services
verify_gancio_config
# Detect if npm dependencies changed (stale anonymous volumes cause missing modules)
# NOTE(review): $CHANGED_FILES is set outside this hunk — presumably a
# newline-separated `git diff --name-only` listing; verify against the earlier
# phases of the script. The regex matches api/ or admin/ package manifests.
NEEDS_VOLUME_REFRESH=false
if echo "$CHANGED_FILES" | grep -qE "^(api|admin)/(package\.json|package-lock\.json)"; then
NEEDS_VOLUME_REFRESH=true
warn "Package dependencies changed — will recreate containers with fresh volumes"
fi
# Start API (migrations already applied in Phase 5)
info "Starting API..."
if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
info "Removing old API/admin containers (clearing stale node_modules volumes)..."
# `rm -sf` = stop + force-remove, so the recreated containers get fresh
# anonymous volumes instead of reusing stale node_modules from the old ones.
docker compose rm -sf api admin 2>/dev/null || true
fi
docker compose up -d api
# Poll API health check
@@ -840,11 +916,11 @@ if [[ "$MONITORING_WAS_RUNNING" == "true" ]]; then
fi
# =============================================================================
# Phase 6: Post-Upgrade Verification
# Phase 7: Post-Upgrade Verification
# =============================================================================
# NOTE(review): removed/added pairs — the verification phase is renumbered
# 6 -> 7 to make room for the inserted Database Migration phase.
phase "6" "Post-Upgrade Verification"
write_progress 6 "Verification" 90 "Running health checks..."
phase "7" "Post-Upgrade Verification"
write_progress 7 "Verification" 90 "Running health checks..."
# Accumulates health-check outcomes; consulted at the end of the script.
VERIFY_FAILED=false
@@ -924,7 +1000,7 @@ if [[ "$VERIFY_FAILED" == "true" ]]; then
# UPGRADE_WARNINGS is a JSON-array string consumed by write_result below.
UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
fi
# Removed/added pair: final progress write renumbered 6 -> 7.
write_progress 6 "Verification" 100 "Upgrade complete!"
write_progress 7 "Verification" 100 "Upgrade complete!"
# NOTE(review): the message concatenates ${PRE_UPGRADE_SHORT}${FINAL_COMMIT}
# with no separator — a glyph (e.g. "→") between the two commit refs may have
# been lost in this rendering; confirm against the real script.
write_result "true" "Upgraded ${PRE_UPGRADE_SHORT}${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"
echo ""