bunker-admin 13513aeca5 Fix VERSION promotion regression: don't gate on soft health-check warnings
Prior commit (ac901c9e, Fix B) gated VERSION.pending promotion behind
VERIFY_FAILED=false, but VERIFY_FAILED is a soft warning signal — it
also fires when the admin container's 30s verify budget is tight
(which was the cry-wolf case Fix 3 addressed in the same commit).

Observed on marcelle during v2.9.5 → v2.9.6: the upgrade completed
successfully (tarball extracted, containers pulled and running new
image), but because the admin healthcheck warned at 30s (still using
v2.9.5's upgrade.sh with its 30s budget), VERIFY_FAILED=true pinned
VERSION back to v2.9.5 despite everything else having advanced. result.json showed success=true but newCommit=v2.9.5.

Hard failures still prevent promotion via on_failure's rm -f of
VERSION.pending before the promotion site is reached. Reaching the
promotion site means Phase 7 completed without a non-zero exit code and
without the failure trap firing — that's the correct gate.

Bunker Admin
2026-04-15 18:33:13 -06:00

1456 lines
56 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# Changemaker Lite V2 — Upgrade Script
# Safely pulls updates, rebuilds containers, and restarts services.
# Usage: ./scripts/upgrade.sh [OPTIONS]
# =============================================================================
set -euo pipefail
# --- Configuration ---
# Resolve the script's own directory so the script works from any cwd;
# the project root is its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
# Per-run timestamp; names this run's log file and backup archives.
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_DIR="${PROJECT_DIR}/logs"
LOG_FILE="${LOG_DIR}/upgrade-${TIMESTAMP}.log"
# PID lockfile preventing concurrent upgrades (see acquire_lock/release_lock).
LOCK_FILE="${PROJECT_DIR}/.upgrade.lock"
BACKUP_DIR="${BACKUP_DIR:-$PROJECT_DIR/backups}"
# Health-check budget (seconds) and poll interval — presumably consumed by
# the post-restart verification loop (not visible in this chunk; confirm).
HEALTH_TIMEOUT=120
HEALTH_INTERVAL=5
# Minimum free disk (MB) required by the Phase 1 pre-flight check.
MIN_DISK_MB=2048
# Tracks which phase the upgrade is currently in, so on_failure can report
# "killed during Phase 4: Container Rebuild" instead of just an exit code.
CURRENT_PHASE_NAME=""
# Warnings accumulated during the run — surfaced in result.json. Global so
# Phase 7 probes (external reachability) can append without losing earlier
# entries set in the Phase 7 verification block.
UPGRADE_WARNINGS="[]"
# Source-built containers (always rebuilt)
SOURCE_CONTAINERS="api admin media-api"
# Conditionally rebuilt if Dockerfile changed
CONDITIONAL_CONTAINERS="nginx"
# App containers stopped during upgrade
APP_CONTAINERS="api admin media-api nginx"
# Infrastructure containers (must stay up)
INFRA_CONTAINERS="v2-postgres redis"
# LSIO containers with anonymous /config volumes (must be force-recreated on upgrade
# to prevent stale anonymous volumes from shadowing bind mounts underneath /config)
LSIO_VOLUME_CONTAINERS="mkdocs-site-server"
# User-modifiable paths (auto-resolve keeps user version on conflict)
USER_PATHS=(
"mkdocs/docs/"
"mkdocs/mkdocs.yml"
"mkdocs/site/"
"configs/"
"nginx/conf.d/services.conf"
)
# --- Detect install mode ---
# A VERSION file without a .git directory marks a tarball (release) install;
# anything else is treated as a source checkout.
if [[ -f "$PROJECT_DIR/VERSION" && ! -d "$PROJECT_DIR/.git" ]]; then
  INSTALL_MODE="release"
else
  INSTALL_MODE="source"
fi

# --- Defaults (overridden by CLI flags below) ---
SKIP_BACKUP=false
PULL_SERVICES=false
DRY_RUN=false
FORCE=false
BRANCH=""
ROLLBACK=false
API_MODE=false
USE_REGISTRY=false
# Release installs always use registry mode
if [[ "$INSTALL_MODE" == "release" ]]; then
  USE_REGISTRY=true
fi

# --- Colors (respects NO_COLOR convention; disabled when stdout isn't a TTY) ---
if [[ -t 1 && -z "${NO_COLOR:-}" ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  CYAN='\033[0;36m'
  BOLD='\033[1m'
  DIM='\033[2m'
  NC='\033[0m'
else
  RED='' GREEN='' YELLOW='' BLUE='' CYAN='' BOLD='' DIM='' NC=''
fi
# =============================================================================
# Utility Functions
# =============================================================================
# Tagged log helpers. All share one emitter; error() goes to stderr.
_emit() { local tag=$1; shift; echo -e "${tag} $*"; }
info()    { _emit "${CYAN}[INFO]${NC}" "$@"; }
success() { _emit "${GREEN}[ OK ]${NC}" "$@"; }
warn()    { _emit "${YELLOW}[WARN]${NC}" "$@"; }
error()   { _emit "${RED}[ERR ]${NC}" "$@" >&2; }
# Print a banner announcing a phase: blank line, rule, " Phase $1: $2",
# rule, blank line. %b interprets the escape sequences in the color vars,
# matching echo -e.
phase() {
  local rule="${BOLD}${BLUE}═══════════════════════════════════════════════${NC}"
  printf '\n%b\n%b\n%b\n\n' \
    "$rule" \
    "${BOLD}${BLUE} Phase $1: $2${NC}" \
    "$rule"
}
# --- API mode: JSON progress/result writing ---
# progress.json: live phase/percentage written by write_progress.
# result.json: final outcome written by write_result / write_result_force.
# Both live under data/upgrade so the admin UI (per comments below) can poll them.
UPGRADE_DIR="${PROJECT_DIR}/data/upgrade"
PROGRESS_FILE="${UPGRADE_DIR}/progress.json"
RESULT_FILE="${UPGRADE_DIR}/result.json"
# Publish live phase state to progress.json for the admin UI.
# $1 phase number, $2 phase name, $3 percent complete, $4 human message.
# No-op outside --api-mode, but always records the phase name for on_failure.
write_progress() {
  local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
  # Track phase name for on_failure regardless of API_MODE — useful for logs too.
  CURRENT_PHASE_NAME="$phase_name"
  [[ "$API_MODE" != "true" ]] && return
  mkdir -p "$UPGRADE_DIR"
  # JSON-escape the free-text message: backslashes FIRST, then quotes.
  # The previous quote-only sed emitted invalid JSON for any message that
  # contained a backslash. printf avoids echo's escape interpretation.
  local esc_msg
  esc_msg="$(printf '%s' "$msg" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')"
  cat > "$PROGRESS_FILE" <<PEOF
{
  "phase": ${phase_num},
  "phaseName": "${phase_name}",
  "percentage": ${pct},
  "message": "${esc_msg}",
  "lastUpdate": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
PEOF
}
# Write the final outcome to result.json, drop progress.json, and refresh
# status.json via upgrade-check.sh (best-effort). No-op outside --api-mode.
# $1 JSON boolean success, $2 human message, $3 optional warnings JSON array.
write_result() {
  [[ "$API_MODE" != "true" ]] && return
  local success="$1" msg="$2"
  local warnings_json="${3:-[]}"
  # Guard START_TIME like write_result_force does: an unset START_TIME under
  # `set -u` would abort the arithmetic expansion mid-result.
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))
  mkdir -p "$UPGRADE_DIR"
  # JSON-escape the message: backslashes before quotes, or backslashes in
  # the message produce invalid JSON (same fix as write_progress).
  local esc_msg
  esc_msg="$(printf '%s' "$msg" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')"
  cat > "$RESULT_FILE" <<REOF
{
  "success": ${success},
  "message": "${esc_msg}",
  "previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
  "newCommit": "$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
  "commitCount": ${COMMIT_COUNT:-0},
  "durationSeconds": ${duration_secs},
  "warnings": ${warnings_json},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  # Clean up progress file — the final result supersedes it.
  rm -f "$PROGRESS_FILE"
  # Update status.json with new commit info (best-effort; never fatal).
  if [[ -x "$SCRIPT_DIR/upgrade-check.sh" ]]; then
    "$SCRIPT_DIR/upgrade-check.sh" 2>/dev/null || true
  fi
}
# Wall-clock time since START_TIME, rendered as "Xm Ys" (no trailing newline).
elapsed() {
  local total=$((SECONDS - START_TIME))
  printf '%dm %ds' "$((total / 60))" "$((total % 60))"
}
# --- Save/restore user-modifiable paths across git pull ---
# Stash every existing USER_PATHS entry into a fresh scratch directory.
# Sets the global USER_SAVE_DIR for restore_user_paths to consume.
save_user_paths() {
  USER_SAVE_DIR="$(mktemp -d)"
  local rel
  for rel in "${USER_PATHS[@]}"; do
    [[ -e "$PROJECT_DIR/$rel" ]] || continue
    mkdir -p "$USER_SAVE_DIR/$(dirname "$rel")"
    cp -a "$PROJECT_DIR/$rel" "$USER_SAVE_DIR/$rel"
  done
}
# Copy saved user-modifiable paths back into the project tree, then discard
# the scratch directory created by save_user_paths. No-op when nothing was saved.
restore_user_paths() {
  [[ -d "${USER_SAVE_DIR:-}" ]] || return 0
  local rel count=0
  for rel in "${USER_PATHS[@]}"; do
    [[ -e "$USER_SAVE_DIR/$rel" ]] || continue
    # Recreate the parent directory in case the pull deleted it.
    mkdir -p "$PROJECT_DIR/$(dirname "$rel")"
    # If plain rm fails (root-owned files written by containers), fall back
    # to removing via a root alpine container; never fail the restore.
    rm -rf "$PROJECT_DIR/$rel" 2>/dev/null || \
      docker run --rm -v "$PROJECT_DIR:/project" alpine rm -rf "/project/$rel" 2>/dev/null || true
    cp -a "$USER_SAVE_DIR/$rel" "$PROJECT_DIR/$rel"
    count=$((count + 1))
  done
  rm -rf "$USER_SAVE_DIR"
  if [[ $count -gt 0 ]]; then
    success "Restored $count user-modifiable path(s)"
  fi
}
# --- Verify Gancio config.json in its data volume ---
# Gancio uses a named Docker volume for /home/node/data. If the volume loses
# config.json (e.g., volume name prefix mismatch after compose project rename),
# Gancio detects an existing DB but no config and refuses to start with:
# "Non empty db! Please move your current db elsewhere than retry."
# This regenerates config.json from .env vars when missing.
verify_gancio_config() {
local gancio_volume
# First volume whose name contains "gancio-data"; empty when none exists.
gancio_volume="$(docker volume ls --format '{{.Name}}' | grep 'gancio-data' | head -1 || true)"
if [[ -z "$gancio_volume" ]]; then
return # No gancio volume exists yet; first run will handle it
fi
# Check if config.json exists and is non-empty
if docker run --rm -v "${gancio_volume}:/data" alpine test -s /data/config.json 2>/dev/null; then
success "Gancio config.json present in $gancio_volume"
return
fi
warn "Gancio config.json missing in volume $gancio_volume — regenerating from .env"
# Fallback values mirror the stack defaults; override via .env.
local base_url="${GANCIO_BASE_URL:-https://events.cmlite.org}"
local pg_user="${V2_POSTGRES_USER:-changemaker}"
local pg_pass="${V2_POSTGRES_PASSWORD:-changemaker}"
local config_json="{\"baseurl\":\"${base_url}\",\"server\":{\"host\":\"0.0.0.0\",\"port\":13120},\"db\":{\"dialect\":\"postgres\",\"host\":\"changemaker-v2-postgres\",\"port\":5432,\"database\":\"gancio\",\"username\":\"${pg_user}\",\"password\":\"${pg_pass}\"}}"
# Write via a root container and chown to uid 1000 (the node user in the image).
docker run --rm -v "${gancio_volume}:/data" alpine sh -c \
"echo '${config_json}' > /data/config.json && chown 1000:1000 /data/config.json"
success "Gancio config.json regenerated"
}
# --- Lockfile ---
# Single-instance guard: refuse to run while a live process holds the lock,
# reclaim locks left behind by dead PIDs, then record our own PID.
acquire_lock() {
  if [[ ! -f "$LOCK_FILE" ]]; then
    echo $$ > "$LOCK_FILE"
    return
  fi
  local holder
  holder=$(cat "$LOCK_FILE" 2>/dev/null || echo "")
  if [[ -n "$holder" ]] && kill -0 "$holder" 2>/dev/null; then
    error "Another upgrade is running (PID $holder). If stale, remove $LOCK_FILE"
    exit 1
  fi
  warn "Removing stale lock file (PID $holder no longer running)"
  rm -f "$LOCK_FILE"
  echo $$ > "$LOCK_FILE"
}
# Drop the PID lockfile; safe to call even when no lock is held.
release_lock() {
  rm -f -- "$LOCK_FILE"
}
# --- .env loading (from backup.sh — handles special chars) ---
# Exports KEY=VALUE pairs from the project .env: skips blanks and comments,
# trims whitespace around the key, peels one layer of surrounding quotes,
# and only exports names that are valid shell identifiers.
load_env() {
  local env_file="$PROJECT_DIR/.env"
  [[ -f "$env_file" ]] || return 0
  local key value
  while IFS='=' read -r key value; do
    [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
    key="$(echo "$key" | xargs)"
    # Strip a single pair of surrounding double or single quotes.
    value="${value%\"}"
    value="${value#\"}"
    value="${value%\'}"
    value="${value#\'}"
    if [[ "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
      export "$key=$value"
    fi
  done < "$env_file"
}
# --- Print rollback instructions ---
# Human-readable recovery steps, printed by on_failure after a hard failure.
# Output branches on INSTALL_MODE: release installs re-download the prior
# tarball; source installs check out the pre-upgrade commit.
print_rollback_help() {
local commit="${PRE_UPGRADE_COMMIT:-unknown}"
echo ""
echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
echo -e "${BOLD}${RED} Upgrade Failed — Rollback Instructions${NC}"
echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
echo ""
if [[ "$INSTALL_MODE" == "release" ]]; then
# Release installs have no .git — rollback is "re-download the prior tarball".
# VERSION.rollback is seeded at the start of Phase 3 so we always know what
# tag to go back to, across multiple failed attempts.
local prior
prior="$(cat "${UPGRADE_DIR}/VERSION.rollback" 2>/dev/null | head -1 || echo "vX.Y.Z")"
echo -e " ${BOLD}1.${NC} Restore prior release tarball (${BOLD}${prior}${NC}):"
echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
echo -e " ${CYAN}URL=https://gitea.bnkops.com/admin/changemaker.lite/releases/download/${prior}/changemaker-lite-${prior}.tar.gz${NC}"
echo -e " ${CYAN}curl -fSL \"\$URL\" -o /tmp/rb.tar.gz && tar xzf /tmp/rb.tar.gz --strip-components=1 -C $PROJECT_DIR${NC}"
echo ""
echo -e " ${BOLD}2.${NC} Pull prior images and restart:"
echo -e " ${CYAN}docker compose pull api admin media-api nginx${NC}"
echo -e " ${CYAN}docker compose up -d${NC}"
else
echo -e " ${BOLD}1.${NC} Restore code to pre-upgrade commit:"
echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
echo -e " ${CYAN}git checkout $commit${NC}"
echo ""
echo -e " ${BOLD}2.${NC} Rebuild and restart:"
echo -e " ${CYAN}docker compose build api admin media-api${NC}"
echo -e " ${CYAN}docker compose up -d${NC}"
fi
echo ""
echo -e " ${BOLD}3.${NC} If database rollback is needed (destructive!):"
echo -e " ${CYAN}# Find backup archive:${NC}"
echo -e " ${CYAN}ls -lt $BACKUP_DIR/changemaker-v2-backup-*.tar.gz | head -5${NC}"
echo -e " ${CYAN}# Extract and restore:${NC}"
echo -e " ${CYAN}tar xzf <backup>.tar.gz -C /tmp${NC}"
echo -e " ${CYAN}gunzip -c /tmp/<backup>/v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
echo ""
echo -e " Or use: ${CYAN}./scripts/upgrade.sh --rollback${NC}"
echo ""
}
# --- Failure trap ---
# Runs on every exit path (EXIT plus explicit TERM/INT). Always cleans up
# scratch state and the staged VERSION; on a real (non-zero, non-dry-run)
# failure it additionally records a truthful result, clears stale progress,
# archives the failure to history, and prints manual rollback guidance.
on_failure() {
  local exit_code=$?
  # Scratch dir from save_user_paths may survive an aborted Phase 3.
  if [[ -d "${USER_SAVE_DIR:-}" ]]; then
    rm -rf "$USER_SAVE_DIR"
  fi
  # Discard staged VERSION — the bump must only happen after full success.
  rm -f "${UPGRADE_DIR}/VERSION.pending" 2>/dev/null || true
  release_lock
  # Nothing more to report for clean exits or dry runs.
  if [[ $exit_code -eq 0 ]] || [[ "$DRY_RUN" == "true" ]]; then
    return
  fi
  local where="${CURRENT_PHASE_NAME:-unknown phase}"
  local fail_msg="Upgrade failed during ${where} at line ${BASH_LINENO[0]} (exit ${exit_code})"
  error "$fail_msg"
  # write_result_force bypasses the API_MODE guard — previously a SIGTERM
  # during a watcher-triggered upgrade left stale success data in result.json.
  write_result_force "false" "$fail_msg"
  # Clear progress so the admin UI doesn't show a phantom in-progress phase.
  rm -f "$PROGRESS_FILE" 2>/dev/null || true
  # Append to history so the failure is discoverable later.
  archive_failure_to_history "$fail_msg"
  print_rollback_help
  info "Log file: $LOG_FILE"
}
# Same as write_result but bypasses the API_MODE guard. Used by on_failure
# to ensure a failure record always lands, even in non-API-mode runs.
# $1 JSON boolean success, $2 human message.
write_result_force() {
  local success="$1" msg="$2"
  # ${START_TIME:-SECONDS}: if the failure happens before START_TIME is set,
  # arithmetic resolves the literal name SECONDS and the duration is 0.
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))
  mkdir -p "$UPGRADE_DIR"
  # JSON-escape the message: backslashes before quotes — the quote-only sed
  # produced invalid JSON for messages containing backslashes.
  local esc_msg
  esc_msg="$(printf '%s' "$msg" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')"
  cat > "$RESULT_FILE" <<REOF
{
  "success": ${success},
  "message": "${esc_msg}",
  "previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
  "newCommit": "$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
  "commitCount": ${COMMIT_COUNT:-0},
  "durationSeconds": ${duration_secs},
  "warnings": ${UPGRADE_WARNINGS:-[]},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
}
# Append a failure record to history.json (newest first, capped at 50 entries
# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts).
# $1 — failure message. Failures never carry warnings (empty array).
archive_failure_to_history() {
_archive_to_history "false" "$1" "[]"
}
# Mirror for success path — prior code relied on the API's handlePostRestartResult
# to archive, which only fires for auto-upgrade post-restart. Admin-UI-triggered
# successes were leaking if the user dismissed the result card before the API
# polled. API-side archiveResult dedupes on completedAt, so double-append is safe.
# $1 — success message; warnings come from the global UPGRADE_WARNINGS accumulator.
archive_success_to_history() {
_archive_to_history "true" "$1" "${UPGRADE_WARNINGS:-[]}"
}
# Prepend one outcome record to history.json (newest first, capped at 50).
# $1 JSON boolean success, $2 human message, $3 warnings JSON array.
# Best-effort by design: bookkeeping failures never abort the upgrade.
_archive_to_history() {
  local success="$1" msg="$2" warnings_json="$3"
  local hist="${UPGRADE_DIR}/history.json"
  mkdir -p "$UPGRADE_DIR"
  # Escape backslashes before quotes — previously a backslash in the message
  # made json.loads below throw, and the record was silently dropped.
  local esc_msg
  esc_msg="$(printf '%s' "$msg" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')"
  local entry
  entry="{\"success\":${success},\"message\":\"${esc_msg}\",\"previousCommit\":\"${PRE_UPGRADE_SHORT:-unknown}\",\"newCommit\":\"$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")\",\"commitCount\":${COMMIT_COUNT:-0},\"durationSeconds\":$((SECONDS - ${START_TIME:-SECONDS})),\"warnings\":${warnings_json},\"completedAt\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}"
  python3 - "$hist" "$entry" <<'PYEOF' 2>/dev/null || true
import json, sys
hist_path, entry_json = sys.argv[1], sys.argv[2]
try:
    with open(hist_path) as f:
        history = json.load(f)
    if not isinstance(history, list):
        history = []
except Exception:
    history = []
history.insert(0, json.loads(entry_json))
history = history[:50]
with open(hist_path, 'w') as f:
    json.dump(history, f, indent=2)
PYEOF
}
# =============================================================================
# Parse Arguments
# =============================================================================
# Print CLI usage to stdout and exit 0. Quoted heredoc delimiter: the body
# is emitted verbatim with no variable expansion.
show_help() {
cat << 'EOF'
Changemaker Lite V2 — Upgrade Script
Usage: ./scripts/upgrade.sh [OPTIONS]
Options:
--skip-backup Skip backup phase (requires --force)
--pull-services Also pull new third-party Docker images
--use-registry Pull pre-built images from Gitea registry instead of rebuilding
--dry-run Show what would happen without executing
--force Continue past non-critical warnings
--branch BRANCH Git branch to pull (default: current branch)
--rollback Rollback to pre-upgrade commit
--api-mode Write progress/result JSON for admin UI
--help Show this help message
Examples:
./scripts/upgrade.sh # Standard upgrade (build from source)
./scripts/upgrade.sh --use-registry # Fast upgrade using pre-built Gitea images
./scripts/upgrade.sh --dry-run # Preview changes
./scripts/upgrade.sh --pull-services # Also update PostgreSQL, Redis, etc.
./scripts/upgrade.sh --rollback # Revert last upgrade
EOF
exit 0
}
# Consume CLI flags (see show_help). Unknown flags abort with a usage hint.
while [[ $# -gt 0 ]]; do
case "$1" in
--skip-backup) SKIP_BACKUP=true; shift ;;
--pull-services) PULL_SERVICES=true; shift ;;
--dry-run) DRY_RUN=true; shift ;;
--force) FORCE=true; shift ;;
--branch) BRANCH="$2"; shift 2 ;;
--rollback) ROLLBACK=true; shift ;;
--api-mode) API_MODE=true; shift ;;
--use-registry) USE_REGISTRY=true; shift ;;
--help|-h) show_help ;;
*) error "Unknown option: $1"; echo "Run with --help for usage."; exit 1 ;;
esac
done
# Validate flag combinations
# Skipping backup is risky enough that it demands explicit --force consent.
if [[ "$SKIP_BACKUP" == "true" ]] && [[ "$FORCE" != "true" ]]; then
error "--skip-backup requires --force (backup protects your data)"
exit 1
fi
# =============================================================================
# Main
# =============================================================================
START_TIME=$SECONDS
cd "$PROJECT_DIR"
# Setup logging
mkdir -p "$LOG_DIR"
# Mirror all stdout AND stderr into the per-run log file.
exec > >(tee -a "$LOG_FILE") 2>&1
echo ""
echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}"
echo -e "${BOLD}${BLUE} Changemaker Lite V2 — Upgrade${NC}"
echo -e "${BOLD}${BLUE} ${TIMESTAMP}${NC}"
echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}"
if [[ "$DRY_RUN" == "true" ]]; then
echo ""
echo -e " ${YELLOW}DRY RUN — no changes will be made${NC}"
fi
# From here on, every exit path (including set -e aborts) runs on_failure.
trap on_failure EXIT
# Explicit SIGTERM/SIGINT traps: bash runs EXIT on these too in theory, but
# the marcelle v2.9.2 → v2.9.3 SIGTERM-kill showed no failure result was
# written. Belt-and-suspenders — worst case it fires twice, and write_result
# uses `>` so the second write is idempotent.
trap on_failure TERM INT
acquire_lock
load_env
# Determine branch (source mode only — release installs have no git)
if [[ -z "$BRANCH" ]]; then
if [[ "$INSTALL_MODE" == "release" ]]; then
BRANCH="release"
else
BRANCH="$(git rev-parse --abbrev-ref HEAD)"
fi
fi
# =============================================================================
# Rollback Mode
# =============================================================================
if [[ "$ROLLBACK" == "true" ]]; then
phase "R" "Rollback"
if [[ "$INSTALL_MODE" == "release" ]]; then
# Release-mode rollback: re-extract the prior release tarball recorded
# in VERSION.rollback (seeded at Phase 3 start of any upgrade).
PRIOR_TAG="$(cat "${UPGRADE_DIR}/VERSION.rollback" 2>/dev/null | head -1 || true)"
if [[ -z "$PRIOR_TAG" ]]; then
error "No VERSION.rollback marker found at ${UPGRADE_DIR}/VERSION.rollback"
error "Cannot determine prior release. Run: curl -fSL <prior-tarball-url> | tar xz -C $PROJECT_DIR --strip-components=1"
release_lock
exit 1
fi
info "Rolling back to prior release: ${PRIOR_TAG}"
TARBALL_URL="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/admin/changemaker.lite/releases/download/${PRIOR_TAG}/changemaker-lite-${PRIOR_TAG}.tar.gz"
if [[ "$DRY_RUN" == "true" ]]; then
info "[DRY RUN] Would download: $TARBALL_URL"
info "[DRY RUN] Would extract to: $PROJECT_DIR (preserving .env)"
info "[DRY RUN] Would run: docker compose pull api admin media-api nginx && docker compose up -d"
release_lock
exit 0
fi
ROLLBACK_DIR="$(mktemp -d)"
if ! curl -fSL "$TARBALL_URL" -o "${ROLLBACK_DIR}/rb.tar.gz"; then
error "Failed to download prior release tarball from ${TARBALL_URL}"
rm -rf "$ROLLBACK_DIR"
release_lock
exit 1
fi
tar xzf "${ROLLBACK_DIR}/rb.tar.gz" -C "$ROLLBACK_DIR"
# The tarball extracts to a single versioned top-level directory.
ROLLBACK_SRC="$(find "$ROLLBACK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
# Keep the live .env; everything else reverts to the prior release.
rsync -a --exclude='.env' "$ROLLBACK_SRC/" "$PROJECT_DIR/"
rm -rf "$ROLLBACK_DIR"
success "Code rolled back to ${PRIOR_TAG}"
export IMAGE_TAG="latest"
docker compose pull api admin media-api nginx || warn "Some images failed to pull — check registry reachability"
docker compose up -d
success "Containers restarted on ${PRIOR_TAG} images"
else
# Source-mode rollback: legacy git-based flow.
LATEST_ARCHIVE="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
if [[ -z "$LATEST_ARCHIVE" ]]; then
error "No backup archives found in $BACKUP_DIR"
error "Cannot determine pre-upgrade commit. Manual rollback needed."
release_lock
exit 1
fi
info "Latest backup: $(basename "$LATEST_ARCHIVE")"
ARCHIVE_DIR="$(basename "$LATEST_ARCHIVE" .tar.gz)"
# git-commit.txt is appended to the archive during Phase 2 of an upgrade.
ROLLBACK_COMMIT="$(tar xzf "$LATEST_ARCHIVE" -O "${ARCHIVE_DIR}/git-commit.txt" 2>/dev/null || true)"
if [[ -z "$ROLLBACK_COMMIT" ]]; then
error "No git-commit.txt found in backup archive."
error "Manually specify: git checkout <commit-hash>"
release_lock
exit 1
fi
info "Rolling back to commit: $ROLLBACK_COMMIT"
if [[ "$DRY_RUN" == "true" ]]; then
info "[DRY RUN] Would run: git checkout $ROLLBACK_COMMIT"
info "[DRY RUN] Would rebuild: docker compose build $SOURCE_CONTAINERS"
info "[DRY RUN] Would restart: docker compose up -d"
release_lock
exit 0
fi
git checkout -B "$BRANCH" "$ROLLBACK_COMMIT"
docker compose build $SOURCE_CONTAINERS
docker compose up -d
success "Rolled back to $ROLLBACK_COMMIT"
echo ""
echo -e " ${BOLD}Database restore:${NC}"
echo -e " Code has been rolled back. Database was NOT rolled back."
echo -e " The backup archive contains a PostgreSQL dump."
echo -e " To restore (${RED}DESTRUCTIVE — replaces current data${NC}):"
echo ""
ARCHIVE_DIR_NAME="$(basename "$LATEST_ARCHIVE" .tar.gz)"
echo -e " ${CYAN}tar xzf $LATEST_ARCHIVE -C /tmp${NC}"
echo -e " ${CYAN}gunzip -c /tmp/$ARCHIVE_DIR_NAME/v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
echo ""
fi
release_lock
exit 0
fi
# =============================================================================
# Phase 1: Pre-flight Checks
# =============================================================================
phase "1" "Pre-flight Checks"
write_progress 1 "Pre-flight Checks" 5 "Verifying system requirements..."
# Docker
if command -v docker &>/dev/null; then
success "Docker: $(docker --version | head -1)"
else
error "Docker is not installed."
exit 1
fi
# Compose must be the v2 plugin (`docker compose`), not legacy docker-compose.
if docker compose version &>/dev/null; then
success "Docker Compose: $(docker compose version --short)"
else
error "Docker Compose v2 plugin not found."
exit 1
fi
# Docker daemon running
if docker info &>/dev/null 2>&1; then
success "Docker daemon running"
else
error "Docker daemon not running."
exit 1
fi
# Git
if command -v git &>/dev/null; then
success "Git: $(git --version)"
else
error "Git is not installed."
exit 1
fi
# Remote reachable (source mode only — release mode pulls from Gitea API later)
if [[ "$INSTALL_MODE" == "source" ]]; then
info "Checking git remote..."
# 10s timeout so a hung remote can't stall the whole upgrade here.
if timeout 10 git ls-remote origin HEAD &>/dev/null 2>&1; then
success "Git remote reachable"
else
error "Cannot reach git remote. Check your network or remote configuration."
exit 1
fi
fi
# Working directory checks
if [[ ! -f "$PROJECT_DIR/docker-compose.yml" ]]; then
error "docker-compose.yml not found. Are you in the project root?"
exit 1
fi
if [[ ! -f "$PROJECT_DIR/.env" ]]; then
error ".env not found. Run ./config.sh first."
exit 1
fi
success "Project files verified"
# Disk space
# df -m reports megabytes; column 4 of the data row is "available".
AVAILABLE_MB=$(df -m "$PROJECT_DIR" | awk 'NR==2 {print $4}')
if [[ "$AVAILABLE_MB" -lt "$MIN_DISK_MB" ]]; then
error "Insufficient disk space: ${AVAILABLE_MB}MB available, ${MIN_DISK_MB}MB required."
exit 1
fi
success "Disk space: ${AVAILABLE_MB}MB available"
# Record pre-upgrade state
if [[ "$INSTALL_MODE" == "source" ]]; then
PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)"
PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)"
info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))"
else
# Release mode: derive "commit" from VERSION file (format: <tag>\n<sha>)
PRE_UPGRADE_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")"
PRE_UPGRADE_SHORT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
info "Current version: $PRE_UPGRADE_SHORT"
fi
info "Target branch: $BRANCH"
# Record running containers (for restoring monitoring profile later)
MONITORING_WAS_RUNNING=false
if docker ps --format '{{.Names}}' | grep -q 'prometheus-changemaker'; then
MONITORING_WAS_RUNNING=true
info "Monitoring stack detected (will restart after upgrade)"
fi
# Source-mode-only checks: dirty files + upstream commit comparison
if [[ "$INSTALL_MODE" == "source" ]]; then
# Warn about uncommitted changes in project-owned paths
# (PROJECT_OWNED_PATHS is intentionally unquoted below: it word-splits
# into multiple git pathspecs)
PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml"
DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)"
if [[ -n "$DIRTY_PROJECT_FILES" ]]; then
warn "Uncommitted changes in project-owned files:"
echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done
if [[ "$FORCE" != "true" ]]; then
error "Commit or stash these changes first, or use --force to continue."
exit 1
fi
warn "Continuing with --force (changes will be stashed)"
fi
# Check for available updates
LOCAL_HEAD="$(git rev-parse HEAD)"
REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)"
if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then
info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes."
if [[ "$FORCE" != "true" ]]; then
success "Nothing to upgrade."
release_lock
exit 0
fi
warn "Continuing with --force despite no upstream changes."
fi
fi
# Release mode: the upstream-version comparison happens later in the
# release-mode block (line ~597) which queries the Gitea Releases API.
# =============================================================================
# Phase 2: Backup
# =============================================================================
phase "2" "Backup"
write_progress 2 "Backup" 15 "Creating backup..."
if [[ "$SKIP_BACKUP" == "true" ]]; then
warn "Backup skipped (--skip-backup --force)"
else
# Run existing backup script
if [[ -x "$SCRIPT_DIR/backup.sh" ]]; then
if [[ "$DRY_RUN" == "true" ]]; then
info "[DRY RUN] Would run: scripts/backup.sh"
else
info "Running database backup..."
"$SCRIPT_DIR/backup.sh"
success "Database backup complete"
fi
else
warn "scripts/backup.sh not found or not executable, skipping database backup"
fi
# Archive user-modifiable content
USER_BACKUP="${BACKUP_DIR}/upgrade-user-content-${TIMESTAMP}.tar.gz"
USER_BACKUP_FILES=()
for p in "${USER_PATHS[@]}"; do
if [[ -e "$PROJECT_DIR/$p" ]]; then
USER_BACKUP_FILES+=("$p")
fi
done
if [[ ${#USER_BACKUP_FILES[@]} -gt 0 ]]; then
if [[ "$DRY_RUN" == "true" ]]; then
info "[DRY RUN] Would archive user content: ${USER_BACKUP_FILES[*]}"
else
mkdir -p "$BACKUP_DIR"
# Best-effort: unreadable (e.g. root-owned) files must not abort the run.
tar -czf "$USER_BACKUP" -C "$PROJECT_DIR" "${USER_BACKUP_FILES[@]}" 2>/dev/null || true
success "User content archived: $(du -h "$USER_BACKUP" | cut -f1)"
fi
fi
# Save pre-upgrade commit hash for rollback reference
LATEST_BACKUP="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
if [[ -n "$LATEST_BACKUP" ]] && [[ "$DRY_RUN" != "true" ]]; then
# Append git-commit.txt into the latest backup archive (read back by
# rollback mode and print_rollback_help).
COMMIT_TMPDIR="$(mktemp -d)"
ARCHIVE_BASENAME="$(basename "$LATEST_BACKUP" .tar.gz)"
mkdir -p "$COMMIT_TMPDIR/$ARCHIVE_BASENAME"
echo "$PRE_UPGRADE_COMMIT" > "$COMMIT_TMPDIR/$ARCHIVE_BASENAME/git-commit.txt"
# Re-pack: extract, add file, recompress
tar xzf "$LATEST_BACKUP" -C "$COMMIT_TMPDIR" 2>/dev/null || true
tar czf "$LATEST_BACKUP" -C "$COMMIT_TMPDIR" "$ARCHIVE_BASENAME"
rm -rf "$COMMIT_TMPDIR"
success "Saved commit reference ($PRE_UPGRADE_SHORT) in backup archive"
fi
fi
# =============================================================================
# Phase 3: Code Update
# =============================================================================
phase "3" "Code Update"
# Also updates CURRENT_PHASE_NAME so a mid-phase failure is labelled correctly.
write_progress 3 "Code Update" 30 "Pulling latest code..."
# --- Release mode: download tarball instead of git pull ---
if [[ "$INSTALL_MODE" == "release" ]]; then
GITEA_API="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/api/v1"
CURRENT_VERSION=$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")
info "Release mode — checking for updates (current: ${CURRENT_VERSION})..."
RELEASE_JSON=$(curl -sf "${GITEA_API}/repos/admin/changemaker.lite/releases/latest" 2>/dev/null || true)
if [[ -z "$RELEASE_JSON" ]]; then
error "Could not reach Gitea API. Check network or GITEA_REGISTRY_URL."
exit 1
fi
LATEST_TAG=$(echo "$RELEASE_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
TARBALL_URL=$(echo "$RELEASE_JSON" | python3 -c "
import sys, json
for a in json.load(sys.stdin).get('assets', []):
if a['name'].endswith('.tar.gz'):
print(a['browser_download_url']); break
" 2>/dev/null || true)
if [[ "$CURRENT_VERSION" == "$LATEST_TAG" ]] && [[ "$FORCE" != "true" ]]; then
info "Already at latest version: ${CURRENT_VERSION}"
write_progress 3 "Code Update" 45 "Already up to date"
elif [[ -z "$TARBALL_URL" ]]; then
error "No tarball found in release ${LATEST_TAG}"
exit 1
else
info "Updating ${CURRENT_VERSION}${LATEST_TAG}..."
write_progress 3 "Code Update" 35 "Downloading ${LATEST_TAG}..."
# Download
DOWNLOAD_DIR=$(mktemp -d)
curl -fSL "$TARBALL_URL" -o "${DOWNLOAD_DIR}/update.tar.gz"
tar xzf "${DOWNLOAD_DIR}/update.tar.gz" -C "$DOWNLOAD_DIR"
UPDATE_SRC=$(find "$DOWNLOAD_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)
# Save user paths
save_user_paths
# Sync new files, preserving .env. VERSION is staged to a pending
# location and only promoted after Phase 7 verification succeeds (Fix B),
# so interrupted upgrades don't leave a misleading "upgraded" marker.
# Also stash the CURRENT VERSION as VERSION.rollback so --rollback and
# print_rollback_help know what release to restore on failure.
write_progress 3 "Code Update" 40 "Applying update..."
mkdir -p "$UPGRADE_DIR"
if [[ -f "$PROJECT_DIR/VERSION" ]]; then
cp "$PROJECT_DIR/VERSION" "$UPGRADE_DIR/VERSION.rollback"
fi
rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/"
cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending"
# Restore user paths
restore_user_paths
# Restore tracked files that may have been overwritten
DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
if [[ -n "$DELETED_TRACKED" ]]; then
echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
fi
rm -rf "$DOWNLOAD_DIR"
success "Updated to ${LATEST_TAG}"
fi
# Skip the git-based update flow below
POST_PULL_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" | tail -1 2>/dev/null || echo "release")"
elif [[ "$DRY_RUN" == "true" ]]; then
info "[DRY RUN] Would fetch and show incoming changes:"
git fetch origin "$BRANCH" 2>/dev/null || true
INCOMING="$(git log --oneline HEAD..origin/"$BRANCH" 2>/dev/null || echo "(unable to preview)")"
if [[ -n "$INCOMING" ]]; then
echo "$INCOMING"
else
info "No new commits to pull."
fi
info "[DRY RUN] Would preserve user-modifiable paths: ${USER_PATHS[*]}"
info "[DRY RUN] Would stash local changes, pull, and pop stash"
release_lock
exit 0
fi
# Source-mode git pull flow. Release mode handles its update via tarball
# download in the block above and skips this entire section.
if [[ "$INSTALL_MODE" == "source" ]]; then
    # Step 0: Save user-modifiable paths before any git operations
    save_user_paths
    # Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files)
    # ('git ls-files -v' prefixes skip-worktree entries with 'S'.)
    SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)"
    if [[ -n "$SKIP_WORKTREE_FILES" ]]; then
        info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..."
        echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree
        success "Skip-worktree flags cleared"
    fi
    # Step 0c: Fix Docker-owned directories that block git checkout
    # (root-owned bind mounts; chown via a throwaway alpine container so we
    # don't need sudo on the host)
    for owned_dir in api/upgrade api/uploads api/configs; do
        if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then
            info "Fixing permissions on $owned_dir..."
            docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true
        fi
    done
    # Step 1: Stash user changes if any exist
    HAS_CHANGES=false
    if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
        HAS_CHANGES=true
        STASH_NAME="upgrade-${TIMESTAMP}"
        info "Stashing local changes as '$STASH_NAME'..."
        git stash push --include-untracked -m "$STASH_NAME"
        success "Local changes stashed"
    fi
    # Step 3: Pull updates
    info "Pulling updates from origin/$BRANCH..."
    if ! git pull origin "$BRANCH" --no-edit 2>&1; then
        error "git pull failed. This may indicate upstream force-push or branch issues."
        if [[ "$HAS_CHANGES" == "true" ]]; then
            warn "Your stashed changes can be recovered with: git stash pop"
        fi
        exit 1
    fi
    POST_PULL_COMMIT="$(git rev-parse --short HEAD)"
    success "Updated to $POST_PULL_COMMIT"
    # Step 4: Pop stash and handle conflicts
    if [[ "$HAS_CHANGES" == "true" ]]; then
        info "Restoring local changes..."
        if git stash pop 2>&1; then
            success "Local changes restored cleanly"
        else
            warn "Merge conflicts detected during stash pop"
            # Auto-resolve user-modifiable paths by keeping user's version.
            # NOTE: in a stash-pop conflict, '--theirs' is the stashed side
            # (the user's edits); '--ours' would be the freshly pulled tree.
            RESOLVED_COUNT=0
            for user_path in "${USER_PATHS[@]}"; do
                CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)"
                if [[ -n "$CONFLICTED" ]]; then
                    while IFS= read -r cf; do
                        info " Auto-resolving (keeping yours): $cf"
                        git checkout --theirs "$cf" 2>/dev/null || true
                        git add "$cf"
                        RESOLVED_COUNT=$((RESOLVED_COUNT + 1))
                    done < <(echo "$CONFLICTED")
                fi
            done
            # Check if any conflicts remain in project-owned files
            REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)"
            if [[ -n "$REMAINING_CONFLICTS" ]]; then
                error "Unresolved conflicts in project-owned files:"
                echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done
                echo ""
                error "These files have upstream changes that conflict with your edits."
                error "Resolve manually, then run the upgrade again."
                info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT"
                info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT"
                exit 1
            fi
            if [[ $RESOLVED_COUNT -gt 0 ]]; then
                success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)"
            fi
        fi
    fi
    # Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies)
    restore_user_paths
    # Step 4c: Restore any tracked files accidentally deleted by restore_user_paths
    # (can happen when save_user_paths can't read root-owned files in user paths)
    DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
    if [[ -n "$DELETED_TRACKED" ]]; then
        info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..."
        echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
        success "Tracked files restored from HEAD"
    fi
fi
# End of source-mode git pull flow
# Step 5: Detect new env vars
# Merge keys that exist in .env.example but are missing from .env, carrying
# over the example's default value (with inline comments stripped).
info "Checking for new environment variables..."
if [[ -f "$PROJECT_DIR/.env.example" ]] && [[ -f "$PROJECT_DIR/.env" ]]; then
    NEW_VARS=()
    while IFS='=' read -r key value; do
        # Skip blanks and comment lines. IFS='=' splits at the FIRST '=',
        # so values containing '=' arrive intact in $value.
        [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
        key="$(echo "$key" | xargs)"
        # Only plain identifier keys — this also makes the grep pattern
        # below regex-safe (no metacharacters can survive the filter).
        [[ ! "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]] && continue
        if ! grep -q "^${key}=" "$PROJECT_DIR/.env" 2>/dev/null; then
            # Strip inline comments and trim whitespace before appending
            value="${value%%#*}"
            value="$(echo "$value" | xargs)"
            echo "${key}=${value}" >> "$PROJECT_DIR/.env"
            NEW_VARS+=("$key")
        fi
    done < "$PROJECT_DIR/.env.example"
    if [[ ${#NEW_VARS[@]} -gt 0 ]]; then
        warn "New env vars added to .env (review defaults):"
        for v in "${NEW_VARS[@]}"; do
            echo -e " ${CYAN}$v${NC}"
        done
    else
        success "No new environment variables"
    fi
fi
# Step 6: Print update summary (source mode only — release mode has no commit range)
COMMIT_COUNT=0
if [[ "$INSTALL_MODE" == "source" ]]; then
    COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}"
    # FIX: the previous form
    #   COMMIT_COUNT="$(git log ... | wc -l | xargs || echo 0)"
    # was not actually pipefail-safe: when git log failed, wc had already
    # printed "0" AND pipefail made the whole pipeline fail, so '|| echo 0'
    # appended a SECOND "0", leaving a two-line value that broke the
    # '-gt 20' arithmetic tests below. Guard the git stage itself so the
    # pipeline always succeeds and emits exactly one number.
    COMMIT_COUNT="$({ git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null || true; } | wc -l | xargs)"
    echo ""
    info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)"
    git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 || true
    if [[ "$COMMIT_COUNT" -gt 20 ]]; then
        info " ... and $((COMMIT_COUNT - 20)) more"
    fi
    # Flag commits that may require manual attention (marked "BREAKING" or
    # "[manual]" in their subject line).
    BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)"
    if [[ -n "$BREAKING_COMMITS" ]]; then
        echo ""
        warn "Commits requiring manual attention:"
        echo "$BREAKING_COMMITS" | while read -r line; do
            echo -e " ${YELLOW}$line${NC}"
        done
    fi
else
    info "Update summary: ${PRE_UPGRADE_SHORT} → release"
fi
# =============================================================================
# Phase 4: Container Rebuild
# =============================================================================
phase "4" "Container Rebuild"
write_progress 4 "Container Rebuild" 50 "Preparing containers..."
# Changed-file list since the pre-upgrade commit; drives the conditional
# nginx rebuilds below. Empty in release mode or on git failure — safe
# default (conditional rebuilds are simply skipped).
CHANGED_FILES="$(git diff --name-only "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null || true)"
if [[ "$USE_REGISTRY" == "true" ]]; then
    # --- Registry pull path: pull pre-built production images from Gitea ---
    REGISTRY="${GITEA_REGISTRY:-gitea.bnkops.com/admin}"
    REGISTRY_TAG="$(git rev-parse --short HEAD 2>/dev/null || echo "latest")"
    # Exported so docker compose can interpolate image names/tags.
    export GITEA_REGISTRY="$REGISTRY"
    export IMAGE_TAG="$REGISTRY_TAG"
    export BUILD_TARGET=production
    info "Registry mode: ${REGISTRY} (tag: ${REGISTRY_TAG})"
    write_progress 4 "Container Rebuild" 55 "Pulling images from registry..."
    # Pull core app containers: try SHA tag → :latest fallback → source build
    # NOTE: stderr intentionally flows through so slow/broken pulls are visible
    # in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3
    # systemd-killed upgrade with zero error trace.
    PULLED_TAG=""
    if docker compose pull api admin media-api; then
        success "Core images pulled from registry (tag: ${REGISTRY_TAG})"
        PULLED_TAG="$REGISTRY_TAG"
    elif [[ "$REGISTRY_TAG" != "latest" ]]; then
        warn "Tag :${REGISTRY_TAG} not in registry — trying :latest"
        export IMAGE_TAG="latest"
        if docker compose pull api admin media-api; then
            success "Core images pulled from registry (tag: latest)"
            PULLED_TAG="latest"
            # Retag :latest as :SHA so compose up uses consistent tags
            for svc in api admin media-api; do
                local_img="${REGISTRY}/changemaker-${svc}"
                docker tag "${local_img}:latest" "${local_img}:${REGISTRY_TAG}" 2>/dev/null || true
            done
            export IMAGE_TAG="$REGISTRY_TAG"
        else
            warn "Registry pull failed for :latest too — falling back to source build"
            export IMAGE_TAG="$REGISTRY_TAG"
            # Intentionally unquoted: $SOURCE_CONTAINERS is a space-separated
            # service list that must word-split into multiple arguments.
            docker compose build $SOURCE_CONTAINERS
            success "Source containers rebuilt (registry fallback)"
        fi
    else
        warn "Registry pull failed — falling back to source build"
        docker compose build $SOURCE_CONTAINERS
        success "Source containers rebuilt (registry fallback)"
    fi
    # nginx: try SHA → :latest → rebuild if config changed
    NGINX_PULLED=false
    if docker compose pull nginx; then
        success "nginx pulled from registry (tag: ${IMAGE_TAG})"
        NGINX_PULLED=true
    elif [[ "$REGISTRY_TAG" != "latest" ]]; then
        export IMAGE_TAG="latest"
        if docker compose pull nginx; then
            docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true
            success "nginx pulled from registry (tag: latest)"
            NGINX_PULLED=true
        fi
        # Restore the SHA tag either way so later compose calls stay consistent.
        export IMAGE_TAG="$REGISTRY_TAG"
    fi
    if [[ "$NGINX_PULLED" == "false" ]]; then
        if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
            info "Rebuilding nginx (config changed, not in registry)..."
            docker compose build nginx
            success "nginx rebuilt"
        else
            info "nginx unchanged, skipping rebuild"
        fi
    fi
else
    # --- Source build path (original behaviour) ---
    info "Rebuilding source containers: $SOURCE_CONTAINERS"
    docker compose build $SOURCE_CONTAINERS
    success "Source containers rebuilt"
    # Conditionally rebuild containers whose Dockerfiles changed
    for svc in $CONDITIONAL_CONTAINERS; do
        case "$svc" in
            nginx)
                if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
                    info "Rebuilding nginx (config changed)..."
                    docker compose build nginx
                    success "nginx rebuilt"
                else
                    info "nginx unchanged, skipping rebuild"
                fi
                ;;
        esac
    done
fi
# Optionally pull third-party images
if [[ "$PULL_SERVICES" == "true" ]]; then
    info "Pulling latest third-party images..."
    docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true
    success "Third-party images updated"
    # Record image digests for audit trail (best-effort, non-fatal)
    info "Recording image digests for audit trail..."
    docker compose images --format json 2>/dev/null | \
        python3 -c "import sys,json; [print(f' {i[\"Repository\"]}:{i[\"Tag\"]} -> {i[\"ID\"][:12]}') for i in json.load(sys.stdin)]" \
        2>/dev/null || true
fi
# =============================================================================
# Phase 5: Database Migration
# =============================================================================
phase "5" "Database Migration"
write_progress 5 "Database Migration" 55 "Checking database state..."
# Ensure infrastructure is running and healthy
info "Ensuring infrastructure is up..."
# Intentionally unquoted: $INFRA_CONTAINERS is a space-separated service list.
docker compose up -d $INFRA_CONTAINERS
# Wait for PostgreSQL to be ready
info "Waiting for PostgreSQL..."
PG_WAIT=0
PG_TIMEOUT=60
# FIX: was '&>/dev/null 2>&1' — '&>' already redirects BOTH streams, so the
# trailing '2>&1' was a confusing no-op. Use the explicit portable form.
while ! docker compose exec -T v2-postgres pg_isready -U "${V2_POSTGRES_USER:-changemaker}" >/dev/null 2>&1; do
    sleep 2
    PG_WAIT=$((PG_WAIT + 2))
    if [[ $PG_WAIT -ge $PG_TIMEOUT ]]; then
        error "PostgreSQL did not become ready within ${PG_TIMEOUT}s"
        exit 1
    fi
done
success "PostgreSQL ready (${PG_WAIT}s)"
# Check for failed/incomplete migrations. "Incomplete" = started >10 minutes
# ago and never finished (e.g. a previous upgrade killed mid-migration).
info "Checking for failed migrations..."
FAILED_MIGRATIONS="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT migration_name FROM _prisma_migrations
WHERE rolled_back_at IS NOT NULL
OR (finished_at IS NULL AND started_at IS NOT NULL
AND started_at < NOW() - INTERVAL '10 minutes')
" 2>/dev/null || true)"
if [[ -n "$FAILED_MIGRATIONS" ]]; then
    warn "Found failed/incomplete migrations — auto-resolving..."
    while IFS= read -r migration_name; do
        [[ -z "$migration_name" ]] && continue
        info " Resolving: $migration_name"
        # --entrypoint "" bypasses the image's default server startup so the
        # container runs only the prisma command.
        docker compose run --rm --no-deps --entrypoint "" api \
            npx prisma migrate resolve --applied "$migration_name" 2>&1 || {
            warn " Could not auto-resolve $migration_name (may need manual intervention)"
        }
    done <<< "$FAILED_MIGRATIONS"
    success "Failed migrations resolved"
else
    success "No failed migrations found"
fi
# Preview pending migrations before applying
info "Checking pending migrations..."
PENDING_OUTPUT="$(docker compose run --rm --no-deps --entrypoint "" api \
    npx prisma migrate status 2>&1 || true)"
if echo "$PENDING_OUTPUT" | grep -q "Following migration"; then
    info "Pending migrations to apply:"
    echo "$PENDING_OUTPUT" | grep -E "^\s+[0-9]" | while read -r line; do
        echo " $line"
    done
fi
# Run migrations in a one-off container (catches errors here, not in a restart loop)
info "Running database migrations..."
write_progress 5 "Database Migration" 60 "Applying migrations..."
if ! docker compose run --rm --no-deps --entrypoint "" api \
    npx prisma migrate deploy 2>&1; then
    error "Database migration failed!"
    error ""
    error "Common fixes:"
    error " 1. Check migration status:"
    error " docker compose exec v2-postgres psql -U changemaker -d changemaker_v2 \\"
    error " -c \"SELECT migration_name, finished_at, rolled_back_at FROM _prisma_migrations ORDER BY started_at DESC LIMIT 10;\""
    error " 2. Mark a stuck migration as applied:"
    error " docker compose run --rm --no-deps --entrypoint '' api npx prisma migrate resolve --applied <migration_name>"
    error " 3. Check logs: docker compose logs api --tail 50"
    error ""
    error "After fixing, re-run: ./scripts/upgrade.sh --force --skip-backup"
    exit 1
fi
# Count applied migrations (display only — "?" on query failure)
MIGRATION_COUNT="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT COUNT(*) FROM _prisma_migrations WHERE finished_at IS NOT NULL
" 2>/dev/null || echo "?")"
success "Migrations up to date ($MIGRATION_COUNT total applied)"
# Run database seed (idempotent)
info "Running database seed..."
write_progress 5 "Database Migration" 65 "Seeding database..."
if ! docker compose run --rm --no-deps --entrypoint "" api \
    npx prisma db seed 2>&1; then
    warn "Database seed had warnings (non-fatal, continuing)"
fi
success "Database seed complete"
# Verify migration state is clean (no drift)
info "Verifying migration state..."
MIGRATE_STATUS="$(docker compose run --rm --no-deps --entrypoint "" api \
    npx prisma migrate status 2>&1 || true)"
if echo "$MIGRATE_STATUS" | grep -qiE "failed|drift|out of sync"; then
    error "Schema drift detected after migration!"
    echo "$MIGRATE_STATUS"
    exit 1
fi
success "Schema state verified — no drift"
# =============================================================================
# Phase 6: Service Restart
# =============================================================================
phase "6" "Service Restart"
write_progress 6 "Service Restart" 70 "Restarting services..."
# Graceful shutdown with extended drain period (allow in-flight requests to complete)
STOP_TIMEOUT=30
info "Stopping application containers (${STOP_TIMEOUT}s grace period)..."
# Intentionally unquoted: $APP_CONTAINERS is a space-separated service list.
docker compose stop -t $STOP_TIMEOUT $APP_CONTAINERS 2>/dev/null || true
success "Application containers stopped"
# Force-recreate LSIO containers to prevent anonymous volume shadowing bind mounts.
# LSIO images define a VOLUME at /config in their Dockerfile. When a container is
# merely restarted, Docker reuses the old anonymous volume whose /config/www is empty,
# which shadows the bind mount (e.g., ./mkdocs/site:/config/www → 403 Forbidden).
# Removing the container first ensures a fresh anonymous volume that respects bind mounts.
info "Removing LSIO containers (clearing anonymous volumes)..."
docker compose rm -sf $LSIO_VOLUME_CONTAINERS 2>/dev/null || true
success "LSIO containers cleared for fresh recreation"
# Verify Gancio config.json exists before starting services
verify_gancio_config
# Detect if npm dependencies changed (stale anonymous volumes cause missing modules)
NEEDS_VOLUME_REFRESH=false
if echo "$CHANGED_FILES" | grep -qE "^(api|admin)/(package\.json|package-lock\.json)"; then
    NEEDS_VOLUME_REFRESH=true
    warn "Package dependencies changed — will recreate containers with fresh volumes"
fi
# Start API (migrations already applied in Phase 5)
info "Starting API..."
if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
    info "Removing old API/admin containers (clearing stale node_modules volumes)..."
    # -v additionally drops anonymous volumes so node_modules is repopulated
    # from the freshly built/pulled image.
    docker compose rm -sfv api admin 2>/dev/null || true
fi
docker compose up -d api
# Poll API health check
info "Waiting for API health check..."
API_WAIT=0
while true; do
    if docker compose exec -T api wget -q --spider http://localhost:4000/api/health 2>/dev/null; then
        break
    fi
    # Detect container crash early (don't wait full timeout)
    if ! docker compose ps api --format '{{.State}}' 2>/dev/null | grep -q "running"; then
        error "API container exited unexpectedly"
        docker compose logs api --tail 20
        exit 1
    fi
    sleep $HEALTH_INTERVAL
    API_WAIT=$((API_WAIT + HEALTH_INTERVAL))
    if [[ $API_WAIT -ge $HEALTH_TIMEOUT ]]; then
        error "API did not become healthy within ${HEALTH_TIMEOUT}s"
        error "Check logs: docker compose logs api --tail 50"
        exit 1
    fi
done
success "API healthy (${API_WAIT}s)"
# Start everything else (exclude one-shot init containers)
# '--scale svc=0' keeps one-shot init containers from re-running on upgrade.
info "Starting remaining services..."
docker compose up -d \
    --scale listmonk-init=0 \
    --scale gancio-init=0 \
    --scale vaultwarden-init=0
success "All services started"
# Restart Pangolin tunnel connector if running (may hold stale state after nginx rebuild)
if docker ps --format '{{.Names}}' | grep -q 'newt'; then
    info "Restarting Pangolin tunnel connector..."
    docker compose restart newt 2>/dev/null || true
    success "Newt tunnel restarted"
fi
# Restart monitoring if it was running before
if [[ "$MONITORING_WAS_RUNNING" == "true" ]]; then
    info "Restarting monitoring stack..."
    if docker compose --profile monitoring up -d 2>&1; then
        success "Monitoring stack restarted"
    else
        warn "Monitoring stack restart had errors (non-fatal, services may already be running)"
    fi
fi
# =============================================================================
# Phase 7: Post-Upgrade Verification
# =============================================================================
phase "7" "Post-Upgrade Verification"
write_progress 7 "Verification" 90 "Running health checks..."
# Soft-failure flag: flipped to true by any failed health check below. It is
# surfaced as a warning in result.json only; reaching the end of Phase 7
# without an exit/trap is what gates success — NOT this flag.
VERIFY_FAILED=false
# Polling health check helper (retries for up to MAX_WAIT seconds)
verify_service_health() {
    # Poll a health probe until it passes or the time budget runs out.
    #   $1 - display name for log lines
    #   $2 - probe command (a string, run via eval; its output is discarded)
    #   $3 - time budget in seconds (optional, default 30)
    # On success, logs how long the service took. On timeout, logs a warning
    # and raises the global soft-failure flag VERIFY_FAILED. Always returns 0:
    # under 'set -e' a non-zero return here would abort the script before
    # write_result runs — VERIFY_FAILED is the signal callers actually check.
    local name="$1" check_cmd="$2" max_wait="${3:-30}"
    local waited=0
    while :; do
        # Budget exhausted → soft failure, never a hard exit.
        if (( waited >= max_wait )); then
            warn "$name: not responding after ${max_wait}s"
            VERIFY_FAILED=true
            return 0
        fi
        if eval "$check_cmd" 2>/dev/null; then
            success "$name: healthy (${waited}s)"
            return 0
        fi
        sleep 3
        waited=$(( waited + 3 ))
    done
}
# API health (with polling — may still be running migrations)
verify_service_health "API (port 4000)" \
    "docker compose exec -T api wget -q --spider http://localhost:4000/api/health" 45
# Admin health — 90s matches the admin container's start_period + a cushion
# for first-boot Vite bundling. 30s was aspirational and produced cry-wolf
# warnings on every successful upgrade.
verify_service_health "Admin (port 3000)" \
    "docker compose exec -T admin wget -q --spider http://localhost:3000/" 90
# Media API health (optional — may not be enabled)
if docker ps --format '{{.Names}}' | grep -q 'changemaker-media-api'; then
    verify_service_health "Media API (port 4100)" \
        "docker compose exec -T media-api wget -q --spider http://127.0.0.1:4100/health" 30
fi
# Gancio health (optional) — restart loop is still a hard signal, but
# "starting" now gets retry grace instead of passing silently.
if docker ps --format '{{.Names}}' | grep -q 'gancio-changemaker'; then
    if docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -qi "restarting"; then
        warn "Gancio: restart loop detected (check config.json in gancio-data volume)"
        VERIFY_FAILED=true
    else
        # Relies on the compose healthcheck status string containing "healthy".
        verify_service_health "Gancio" \
            "docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -q healthy" 60
    fi
fi
# MkDocs static site health (retry — first-boot rebuild can lag)
if docker ps --format '{{.Names}}' | grep -q 'mkdocs-site-server'; then
    verify_service_health "MkDocs site (port ${MKDOCS_SITE_SERVER_PORT:-4004})" \
        "curl -sf http://localhost:${MKDOCS_SITE_SERVER_PORT:-4004}/ -o /dev/null" 30
fi
# Check for containers in restart loop (any service, not just those probed)
RESTARTING="$(docker compose ps 2>/dev/null | grep -i "restarting" || true)"
if [[ -n "$RESTARTING" ]]; then
    warn "Containers in restart loop:"
    echo "$RESTARTING"
    VERIFY_FAILED=true
fi
# VERIFY_FAILED is advisory only: it becomes a warning in result.json but
# never fails the upgrade and never blocks the VERSION promotion below.
if [[ "$VERIFY_FAILED" == "true" ]]; then
    warn "Some health checks failed. Services may still be starting."
    info "Check logs: docker compose logs --tail 50"
    UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
else
    success "All health checks passed"
fi
# --- External reachability probe (Fix C) ---
# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be
# caught by the localhost-only checks above. Warn (don't fail) because
# transient tunnel issues should not roll back a successful upgrade.
if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then
    info "Probing external API at https://api.${DOMAIN}/api/health ..."
    # "000" sentinel when curl itself fails (timeout, DNS, TLS).
    EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
        "https://api.${DOMAIN}/api/health" 2>/dev/null || echo "000")"
    if [[ "$EXT_CODE" == "200" ]]; then
        success "External API reachable (HTTP 200)"
    else
        warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel"
        # Append the probe result to the UPGRADE_WARNINGS JSON array via an
        # inline python3 heredoc; if python fails for any reason, fall back
        # to the unmodified value so result.json stays valid.
        UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS"
import json, sys
try:
    w = json.loads(sys.argv[1]) if sys.argv[1] else []
except Exception:
    w = []
if not isinstance(w, list):
    w = []
w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}")
print(json.dumps(w))
PYEOF
)"
    fi
fi
# --- Atomic VERSION promotion (Fix B) ---
# The staged VERSION from Phase 3 lands now that we've reached the end of
# Phase 7 without on_failure firing. Promote regardless of VERIFY_FAILED —
# that flag is a soft health-check warning (e.g. "admin slow to respond"),
# not an upgrade failure. The tarball is extracted, containers are up, and
# write_result below will record success=true. Gating promotion on
# VERIFY_FAILED previously caused a "stuck at old VERSION" bug where a
# transient admin healthcheck warning pinned the install back.
# Hard failures (SIGTERM, exit !=0) still prevent promotion via on_failure,
# which rm -f's VERSION.pending before it can be promoted.
if [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then
    mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION"
    success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")"
fi
# =============================================================================
# Summary
# =============================================================================
ELAPSED="$(elapsed)"
# Final revision identifier: short git SHA in source mode; first line of
# VERSION in release mode (falls back to "release" if the file is missing).
if [[ "$INSTALL_MODE" == "source" ]]; then
    FINAL_COMMIT="$(git rev-parse --short HEAD)"
else
    FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
fi
write_progress 7 "Verification" 100 "Upgrade complete!"
# FIX: " → " separator restored between the old and new revision, matching
# the intact "Update summary: X → release" message earlier in the script.
# The fused form rendered the two revisions as one token (e.g.
# "Upgraded v2.9.5v2.9.6").
write_result "true" "Upgraded ${PRE_UPGRADE_SHORT}${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"
archive_success_to_history "Upgraded ${PRE_UPGRADE_SHORT}${FINAL_COMMIT} (${COMMIT_COUNT} commits)"
echo ""
echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}"
echo -e "${BOLD}${GREEN} Upgrade Complete${NC}"
echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}"
echo ""
echo -e " ${BOLD}Previous:${NC} $PRE_UPGRADE_SHORT"
if [[ "$INSTALL_MODE" == "source" ]]; then
    echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD 2>/dev/null || echo "$FINAL_COMMIT"))"
else
    echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT"
fi
echo -e " ${BOLD}Commits:${NC} $COMMIT_COUNT"
echo -e " ${BOLD}Duration:${NC} $ELAPSED"
echo -e " ${BOLD}Log:${NC} $LOG_FILE"
echo ""
# Success path: release the upgrade lock, then clear the EXIT trap so
# on_failure (which would delete VERSION.pending and report failure) does
# not fire on normal script exit.
release_lock
trap - EXIT