bunker-admin a82e95946b fix(gancio): pre-start config-init sidecar prevents restart loop
Gancio refuses to start when its DB has tables but the data volume has no
config.json ("Non empty db! Please move your current db elsewhere than retry"),
which produces an infinite restart loop. This hit production tenants bnkops
and trbh (>1200 restart cycles each) — proximate cause was a missing
config.json in changemakerlite_gancio-data with the DB fully populated.

Add gancio-config-init alpine sidecar that runs on every `up`:
  - no-op when config.json exists
  - regenerates from .env when missing (1000:1000 ownership)
  - gancio service now depends on its service_completed_successfully

Also harden verify_gancio_config in upgrade.sh to error loudly when
multiple gancio-data volumes match (silent head -1 could pick the wrong
one after a compose project rename).
2026-05-19 17:02:55 -06:00

1467 lines
56 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# Changemaker Lite V2 — Upgrade Script
# Safely pulls updates, rebuilds containers, and restarts services.
# Usage: ./scripts/upgrade.sh [OPTIONS]
# =============================================================================
set -o errexit -o nounset -o pipefail

# --- Configuration ---
# Locations are derived from where this script file lives; the project root
# is taken to be the parent of the script's directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"

# Per-run log file, lock file and backup destination. BACKUP_DIR may be
# supplied by the environment; otherwise it defaults to <project>/backups.
LOG_DIR="${PROJECT_DIR}/logs"
LOG_FILE="${LOG_DIR}/upgrade-${TIMESTAMP}.log"
LOCK_FILE="${PROJECT_DIR}/.upgrade.lock"
BACKUP_DIR="${BACKUP_DIR:-$PROJECT_DIR/backups}"

# Health-check and disk-space thresholds
HEALTH_TIMEOUT=120
HEALTH_INTERVAL=5
MIN_DISK_MB=2048

# Name of the phase currently executing. on_failure reads it so a failure
# can be reported as "killed during Phase 4: Container Rebuild" rather than
# as a bare exit code.
CURRENT_PHASE_NAME=""

# JSON array of warnings collected during the run and surfaced in
# result.json. Kept global so later Phase 7 probes (external reachability)
# can append to it without discarding entries recorded earlier.
UPGRADE_WARNINGS="[]"

# Container groups (space-separated service names):
#   SOURCE_CONTAINERS       built from source, always rebuilt
#   CONDITIONAL_CONTAINERS  rebuilt only if their Dockerfile changed
#   APP_CONTAINERS          stopped for the duration of the upgrade
#   INFRA_CONTAINERS        infrastructure that must stay up
SOURCE_CONTAINERS="api admin media-api"
CONDITIONAL_CONTAINERS="nginx"
APP_CONTAINERS="api admin media-api nginx"
INFRA_CONTAINERS="v2-postgres redis"

# LSIO containers with anonymous /config volumes. These must be
# force-recreated on upgrade so a stale anonymous volume cannot shadow the
# bind mounts underneath /config.
LSIO_VOLUME_CONTAINERS="mkdocs-site-server"

# Paths the user is expected to modify; on conflict the user's version wins.
USER_PATHS=()
USER_PATHS+=("mkdocs/docs/")
USER_PATHS+=("mkdocs/mkdocs.yml")
USER_PATHS+=("mkdocs/site/")
USER_PATHS+=("configs/")
USER_PATHS+=("nginx/conf.d/services.conf")
# --- Detect install mode ---
# A tree that ships a VERSION file but has no .git directory is treated as a
# "release" install; everything else is a "source" install.
INSTALL_MODE="source"
if [[ -f "$PROJECT_DIR/VERSION" && ! -d "$PROJECT_DIR/.git" ]]; then
  INSTALL_MODE="release"
fi

# --- Defaults ---
# Flag defaults; the argument parser further down may override them.
SKIP_BACKUP=false
PULL_SERVICES=false
DRY_RUN=false
FORCE=false
BRANCH=""
ROLLBACK=false
API_MODE=false

# Release installs always use registry mode.
USE_REGISTRY=false
case "$INSTALL_MODE" in
  release) USE_REGISTRY=true ;;
esac
# --- Colors (respects NO_COLOR convention) ---
# Start with colour disabled, then enable the escape sequences only when
# stdout is a terminal and NO_COLOR is unset or empty.
RED=''
GREEN=''
YELLOW=''
BLUE=''
CYAN=''
BOLD=''
DIM=''
NC=''
if [[ -t 1 && -z "${NO_COLOR:-}" ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  CYAN='\033[0;36m'
  BOLD='\033[1m'
  DIM='\033[2m'
  NC='\033[0m'
fi
# =============================================================================
# Utility Functions
# =============================================================================
# Levelled log helpers. Each prints a coloured tag followed by all of its
# arguments joined into one message; backslash escapes in the message are
# expanded. Only error() writes to stderr.
info() {
  printf '%b\n' "${CYAN}[INFO]${NC} $*"
}

success() {
  printf '%b\n' "${GREEN}[ OK ]${NC} $*"
}

warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
}

error() {
  printf '%b\n' "${RED}[ERR ]${NC} $*" >&2
}
# Print a banner announcing a new phase.
# Arguments: $1 - phase identifier (number or letter), $2 - phase title
phase() {
  local rule="═══════════════════════════════════════════════"
  local style="${BOLD}${BLUE}"
  echo ""
  echo -e "${style}${rule}${NC}"
  echo -e "${style} Phase $1: $2${NC}"
  echo -e "${style}${rule}${NC}"
  echo ""
}
# --- API mode: JSON progress/result writing ---
# Directory and file names used by write_progress / write_result and the
# failure/history helpers below.
UPGRADE_DIR="$PROJECT_DIR/data/upgrade"
PROGRESS_FILE="$UPGRADE_DIR/progress.json"
RESULT_FILE="$UPGRADE_DIR/result.json"
# Record the current phase and, in API mode, publish it as progress.json.
# Globals:   CURRENT_PHASE_NAME (written), API_MODE, UPGRADE_DIR,
#            PROGRESS_FILE (read)
# Arguments: $1 - phase number, $2 - phase name, $3 - percentage complete,
#            $4 - human-readable status message
# Outputs:   writes $PROGRESS_FILE when API_MODE is "true"
write_progress() {
  local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
  # Track phase name for on_failure regardless of API_MODE — useful for logs too.
  CURRENT_PHASE_NAME="$phase_name"
  if [[ "$API_MODE" != "true" ]]; then
    return 0
  fi

  # JSON-escape the free-text fields. Backslashes must be handled first so
  # the escapes added afterwards are not themselves doubled.
  local esc_name="$phase_name" esc_msg="$msg"
  esc_name=${esc_name//\\/\\\\}
  esc_name=${esc_name//\"/\\\"}
  esc_msg=${esc_msg//\\/\\\\}
  esc_msg=${esc_msg//\"/\\\"}
  esc_msg=${esc_msg//$'\n'/\\n}
  esc_msg=${esc_msg//$'\r'/\\r}
  esc_msg=${esc_msg//$'\t'/\\t}

  mkdir -p "$UPGRADE_DIR"
  # Write to a temp file and rename so a reader never sees a half-written file.
  local tmp_file="${PROGRESS_FILE}.tmp.$$"
  cat > "$tmp_file" <<PEOF
{
"phase": ${phase_num},
"phaseName": "${esc_name}",
"percentage": ${pct},
"message": "${esc_msg}",
"lastUpdate": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
PEOF
  mv -f "$tmp_file" "$PROGRESS_FILE"
}
# Publish the final upgrade outcome as result.json (API mode only).
# Globals:   API_MODE, START_TIME, UPGRADE_DIR, RESULT_FILE, PROGRESS_FILE,
#            PROJECT_DIR, SCRIPT_DIR, PRE_UPGRADE_SHORT, COMMIT_COUNT (read)
# Arguments: $1 - "true" or "false", $2 - summary message,
#            $3 - optional JSON array of warnings (default "[]")
# Outputs:   writes $RESULT_FILE, removes $PROGRESS_FILE, and runs
#            upgrade-check.sh when it is executable
write_result() {
  if [[ "$API_MODE" != "true" ]]; then
    return 0
  fi
  local success="$1" msg="$2"
  local duration_secs=$((SECONDS - START_TIME))
  local warnings_json="${3:-[]}"

  # JSON-escape the message. Backslashes first, so the escapes added
  # afterwards are not doubled.
  local esc_msg="$msg"
  esc_msg=${esc_msg//\\/\\\\}
  esc_msg=${esc_msg//\"/\\\"}
  esc_msg=${esc_msg//$'\n'/\\n}
  esc_msg=${esc_msg//$'\r'/\\r}
  esc_msg=${esc_msg//$'\t'/\\t}

  # First line of VERSION if present, else the short git hash, else "unknown".
  local new_commit
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")"

  mkdir -p "$UPGRADE_DIR"
  # Write to a temp file and rename so a reader never sees a half-written file.
  local tmp_file="${RESULT_FILE}.tmp.$$"
  cat > "$tmp_file" <<REOF
{
"success": ${success},
"message": "${esc_msg}",
"previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
"newCommit": "${new_commit}",
"commitCount": ${COMMIT_COUNT:-0},
"durationSeconds": ${duration_secs},
"warnings": ${warnings_json},
"completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  mv -f "$tmp_file" "$RESULT_FILE"
  # Clean up progress file
  rm -f "$PROGRESS_FILE"
  # Update status.json with new commit info (best effort)
  if [[ -x "$SCRIPT_DIR/upgrade-check.sh" ]]; then
    "$SCRIPT_DIR/upgrade-check.sh" 2>/dev/null || true
  fi
}
# Print the time elapsed since START_TIME as "<minutes>m <seconds>s"
# (no trailing newline).
elapsed() {
  local total=$((SECONDS - START_TIME))
  local minutes=$((total / 60))
  local remainder=$((total % 60))
  printf '%dm %ds' "$minutes" "$remainder"
}
# --- Save/restore user-modifiable paths across a code update ---
# Copy every existing entry of USER_PATHS from the project tree into a fresh
# temporary directory so it can be put back by restore_user_paths.
# Globals:   USER_PATHS, PROJECT_DIR (read); USER_SAVE_DIR (written)
# Returns:   non-zero if the temporary directory cannot be created
save_user_paths() {
  # Keep the loop variable local so it cannot clobber a caller's variable.
  local rel
  USER_SAVE_DIR="$(mktemp -d)" || return 1
  for rel in "${USER_PATHS[@]}"; do
    if [[ -e "$PROJECT_DIR/$rel" ]]; then
      mkdir -p "$USER_SAVE_DIR/$(dirname "$rel")"
      cp -a "$PROJECT_DIR/$rel" "$USER_SAVE_DIR/$rel"
    fi
  done
}
# Put back the paths stashed by save_user_paths, replacing whatever the code
# update left in their place, then delete the stash.
# Globals:   USER_SAVE_DIR, USER_PATHS, PROJECT_DIR (read)
# Outputs:   a success line when at least one path was restored
restore_user_paths() {
  if [[ -z "${USER_SAVE_DIR:-}" ]] || [[ ! -d "${USER_SAVE_DIR:-}" ]]; then
    return
  fi
  # Loop variables are local so they cannot clobber a caller's variables.
  local rel saved target
  local restored=0
  for rel in "${USER_PATHS[@]}"; do
    saved="$USER_SAVE_DIR/$rel"
    target="$PROJECT_DIR/$rel"
    [[ -e "$saved" ]] || continue
    # Ensure parent directory exists (in case the update deleted it)
    mkdir -p "$PROJECT_DIR/$(dirname "$rel")"
    # Use docker alpine to remove if regular rm fails (root-owned files from containers)
    rm -rf "$target" 2>/dev/null || \
      docker run --rm -v "$PROJECT_DIR:/project" alpine rm -rf "/project/$rel" 2>/dev/null || true
    if [[ -d "$saved" && -d "$target" ]]; then
      # Removal did not succeed and the directory is still there. Copy the
      # saved contents into it; copying the directory itself would nest it
      # one level down (e.g. configs/configs).
      cp -a "$saved/." "$target/"
    else
      cp -a "$saved" "$target"
    fi
    restored=$((restored + 1))
  done
  rm -rf "$USER_SAVE_DIR"
  if [[ $restored -gt 0 ]]; then
    success "Restored $restored user-modifiable path(s)"
  fi
}
# --- Verify Gancio config.json in its data volume ---
# Gancio uses a named Docker volume for /home/node/data. If the volume loses
# config.json (e.g., volume name prefix mismatch after compose project rename),
# Gancio detects an existing DB but no config and refuses to start with:
# "Non empty db! Please move your current db elsewhere than retry."
# This regenerates config.json from .env vars when missing.
# Globals:   GANCIO_BASE_URL, V2_POSTGRES_USER, V2_POSTGRES_PASSWORD (read,
#            each with a built-in default)
# Returns:   0 when no volume exists, the config is present, or it was
#            regenerated; 1 when several volumes match or the write fails
verify_gancio_config() {
  # Note: as of the gancio-config-init sidecar in docker-compose{,prod}.yml,
  # config.json is regenerated automatically on every `up`. This function is
  # kept as belt-and-braces for the upgrade flow specifically (e.g. so the
  # check happens before the compose-up rather than at compose-up time, and
  # so operators see explicit log output during upgrade).
  local matches count
  matches="$(docker volume ls --format '{{.Name}}' | grep 'gancio-data' || true)"
  # grep -c exits 1 on a zero count but still prints "0", hence the || true.
  count=$(printf '%s\n' "$matches" | grep -c '.' || true)
  if [[ "$count" -eq 0 ]]; then
    return 0 # No gancio volume exists yet; first run will handle it
  fi
  if [[ "$count" -gt 1 ]]; then
    error "Multiple gancio-data volumes found — refusing to guess. Resolve manually:\n$matches"
    # Stop here: continuing would pass a multi-line string to docker as the
    # volume name.
    return 1
  fi
  local gancio_volume="$matches"
  # Check if config.json exists and is non-empty
  if docker run --rm -v "${gancio_volume}:/data" alpine test -s /data/config.json 2>/dev/null; then
    success "Gancio config.json present in $gancio_volume"
    return 0
  fi
  warn "Gancio config.json missing in volume $gancio_volume — regenerating from .env"
  local base_url="${GANCIO_BASE_URL:-https://events.cmlite.org}"
  local pg_user="${V2_POSTGRES_USER:-changemaker}"
  local pg_pass="${V2_POSTGRES_PASSWORD:-changemaker}"
  # JSON-escape the interpolated values (backslash first, then double quote).
  base_url=${base_url//\\/\\\\}
  base_url=${base_url//\"/\\\"}
  pg_user=${pg_user//\\/\\\\}
  pg_user=${pg_user//\"/\\\"}
  pg_pass=${pg_pass//\\/\\\\}
  pg_pass=${pg_pass//\"/\\\"}
  local config_json="{\"baseurl\":\"${base_url}\",\"server\":{\"host\":\"0.0.0.0\",\"port\":13120},\"db\":{\"dialect\":\"postgres\",\"host\":\"changemaker-v2-postgres\",\"port\":5432,\"database\":\"gancio\",\"username\":\"${pg_user}\",\"password\":\"${pg_pass}\"}}"
  # Feed the document on stdin instead of splicing it into the sh -c string,
  # so quotes or shell metacharacters in the password can neither break the
  # command nor be executed inside the container.
  if ! printf '%s\n' "$config_json" | docker run --rm -i -v "${gancio_volume}:/data" alpine sh -c \
    'cat > /data/config.json && chown 1000:1000 /data/config.json'; then
    error "Failed to write Gancio config.json into volume $gancio_volume"
    return 1
  fi
  success "Gancio config.json regenerated"
}
# --- Lockfile ---
# Claim the upgrade lock by writing this shell's PID into LOCK_FILE. If the
# file already names a PID that is still alive the script exits with status
# 1; a lock whose PID is empty or no longer running is treated as stale and
# replaced.
acquire_lock() {
  if [[ -f "$LOCK_FILE" ]]; then
    local holder_pid
    holder_pid=$(cat "$LOCK_FILE" 2>/dev/null || echo "")
    if [[ -z "$holder_pid" ]] || ! kill -0 "$holder_pid" 2>/dev/null; then
      warn "Removing stale lock file (PID $holder_pid no longer running)"
      rm -f "$LOCK_FILE"
    else
      error "Another upgrade is running (PID $holder_pid). If stale, remove $LOCK_FILE"
      exit 1
    fi
  fi
  printf '%s\n' "$$" > "$LOCK_FILE"
}
# Drop the upgrade lock; a missing lock file is not an error.
release_lock() {
  local lock_path="$LOCK_FILE"
  rm -f "$lock_path"
}
# --- .env loading (from backup.sh — handles special chars) ---
# Export every KEY=VALUE pair found in $PROJECT_DIR/.env.
#   - blank lines and lines whose key part starts with '#' are skipped
#   - the key is whitespace-trimmed and must be a valid shell identifier
#   - one layer of surrounding double and/or single quotes is stripped
#   - the value is everything after the FIRST '=', taken verbatim
# Globals:   PROJECT_DIR (read); exports one variable per accepted line
load_env() {
  local env_file="$PROJECT_DIR/.env"
  if [[ ! -f "$env_file" ]]; then
    return 0
  fi
  local line key value
  # Read whole lines and split on the first '=' ourselves. Splitting with
  # `IFS='=' read key value` silently drops a single trailing '=' from the
  # value, which corrupts base64-style secrets. The `|| [[ -n ... ]]` keeps a
  # final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # tolerate CRLF line endings
    if [[ "$line" == *=* ]]; then
      key="${line%%=*}"
      value="${line#*=}"
    else
      key="$line"
      value=""
    fi
    [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
    # Trim leading and trailing whitespace from the key.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value%\"}"
    value="${value#\"}"
    value="${value%\'}"
    value="${value#\'}"
    if [[ "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
      export "$key=$value"
    fi
  done < "$env_file"
  return 0
}
# --- Print rollback instructions ---
# Print manual recovery steps after a failed upgrade. Release installs are
# told to re-download the prior tarball; source installs are told to check
# out the pre-upgrade commit. Both get the database-restore recipe.
# Globals:   INSTALL_MODE, UPGRADE_DIR, PROJECT_DIR, BACKUP_DIR,
#            PRE_UPGRADE_COMMIT, GITEA_REGISTRY_URL, colour variables (read)
print_rollback_help() {
  local commit="${PRE_UPGRADE_COMMIT:-unknown}"
  echo ""
  echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
  echo -e "${BOLD}${RED} Upgrade Failed — Rollback Instructions${NC}"
  echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
  echo ""
  if [[ "$INSTALL_MODE" == "release" ]]; then
    # Release installs have no .git — rollback is "re-download the prior tarball".
    # VERSION.rollback is seeded at the start of Phase 3 so we always know what
    # tag to go back to, across multiple failed attempts.
    local prior=""
    if [[ -r "${UPGRADE_DIR}/VERSION.rollback" ]]; then
      # Only the first line (the tag) is wanted. read returns non-zero on a
      # file without a trailing newline but still fills the variable.
      IFS= read -r prior < "${UPGRADE_DIR}/VERSION.rollback" || true
    fi
    # Placeholder when the marker is missing or empty.
    prior="${prior:-vX.Y.Z}"
    # Use the same registry base URL as the --rollback flow so the printed
    # link is right for installs that override GITEA_REGISTRY_URL.
    local registry="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}"
    echo -e " ${BOLD}1.${NC} Restore prior release tarball (${BOLD}${prior}${NC}):"
    echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
    echo -e " ${CYAN}URL=${registry}/admin/changemaker.lite/releases/download/${prior}/changemaker-lite-${prior}.tar.gz${NC}"
    echo -e " ${CYAN}curl -fSL \"\$URL\" -o /tmp/rb.tar.gz && tar xzf /tmp/rb.tar.gz --strip-components=1 -C $PROJECT_DIR${NC}"
    echo ""
    echo -e " ${BOLD}2.${NC} Pull prior images and restart:"
    echo -e " ${CYAN}docker compose pull api admin media-api nginx${NC}"
    echo -e " ${CYAN}docker compose up -d${NC}"
  else
    echo -e " ${BOLD}1.${NC} Restore code to pre-upgrade commit:"
    echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
    echo -e " ${CYAN}git checkout $commit${NC}"
    echo ""
    echo -e " ${BOLD}2.${NC} Rebuild and restart:"
    echo -e " ${CYAN}docker compose build api admin media-api${NC}"
    echo -e " ${CYAN}docker compose up -d${NC}"
  fi
  echo ""
  echo -e " ${BOLD}3.${NC} If database rollback is needed (destructive!):"
  echo -e " ${CYAN}# Find backup archive:${NC}"
  echo -e " ${CYAN}ls -lt $BACKUP_DIR/changemaker-v2-backup-*.tar.gz | head -5${NC}"
  echo -e " ${CYAN}# Extract and restore:${NC}"
  echo -e " ${CYAN}tar xzf <backup>.tar.gz -C /tmp${NC}"
  echo -e " ${CYAN}gunzip -c /tmp/<backup>/v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
  echo ""
  echo -e " Or use: ${CYAN}./scripts/upgrade.sh --rollback${NC}"
  echo ""
}
# --- Failure trap ---
# Installed as the handler for script exit and for SIGTERM/SIGINT. It always
# removes the saved-user-paths directory, discards the staged VERSION
# (Fix B) and releases the lock. When the exit status is non-zero and this
# is not a dry run it also writes a failure result, clears the progress file
# so the admin UI stops showing a frozen phase, archives the failure to
# history, and prints rollback instructions.
on_failure() {
  local exit_code=$? # must stay first: captures the status that triggered us
  # Clean up the user path save directory if it exists
  local saved_paths="${USER_SAVE_DIR:-}"
  if [[ -n "$saved_paths" && -d "$saved_paths" ]]; then
    rm -rf "$USER_SAVE_DIR"
  fi
  # Discard staged VERSION — the bump must only happen after full success.
  rm -f "${UPGRADE_DIR}/VERSION.pending" 2>/dev/null || true
  release_lock
  # Nothing to report on a clean exit or during a dry run.
  if [[ $exit_code -eq 0 || "$DRY_RUN" == "true" ]]; then
    return 0
  fi
  local fail_msg
  fail_msg="Upgrade failed during ${CURRENT_PHASE_NAME:-unknown phase} at line ${BASH_LINENO[0]} (exit ${exit_code})"
  error "$fail_msg"
  # Always write the failure result — previously gated behind API_MODE,
  # which meant SIGTERM during a watcher-triggered upgrade left stale
  # success data in result.json.
  write_result_force "false" "$fail_msg"
  # Clear progress so the admin UI doesn't show a phantom in-progress phase.
  rm -f "$PROGRESS_FILE" 2>/dev/null || true
  # Append to history so the failure is discoverable later.
  archive_failure_to_history "$fail_msg"
  print_rollback_help
  info "Log file: $LOG_FILE"
}
# Same as write_result but bypasses the API_MODE guard. Used by on_failure
# to ensure a failure record always lands, even in non-API-mode runs.
# Unlike write_result it takes its warnings from UPGRADE_WARNINGS and does
# not remove the progress file or run upgrade-check.sh.
# Globals:   START_TIME, UPGRADE_DIR, RESULT_FILE, PROJECT_DIR,
#            PRE_UPGRADE_SHORT, COMMIT_COUNT, UPGRADE_WARNINGS (read)
# Arguments: $1 - "true" or "false", $2 - summary message
write_result_force() {
  local success="$1" msg="$2"
  # START_TIME may be unset if the failure happens very early; in that case
  # the duration comes out as 0.
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))

  # JSON-escape the message. Backslashes first, so the escapes added
  # afterwards are not doubled.
  local esc_msg="$msg"
  esc_msg=${esc_msg//\\/\\\\}
  esc_msg=${esc_msg//\"/\\\"}
  esc_msg=${esc_msg//$'\n'/\\n}
  esc_msg=${esc_msg//$'\r'/\\r}
  esc_msg=${esc_msg//$'\t'/\\t}

  # First line of VERSION if present, else the short git hash, else "unknown".
  local new_commit
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")"

  mkdir -p "$UPGRADE_DIR"
  # Write to a temp file and rename so a reader never sees a half-written file.
  local tmp_file="${RESULT_FILE}.tmp.$$"
  cat > "$tmp_file" <<REOF
{
"success": ${success},
"message": "${esc_msg}",
"previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
"newCommit": "${new_commit}",
"commitCount": ${COMMIT_COUNT:-0},
"durationSeconds": ${duration_secs},
"warnings": ${UPGRADE_WARNINGS:-[]},
"completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  mv -f "$tmp_file" "$RESULT_FILE"
}
# Record a failed run in history.json by delegating to _archive_to_history
# with success=false and an empty warnings list. (History is newest first
# and capped at 50 entries, matching MAX_HISTORY_ENTRIES in
# api/src/modules/upgrade/upgrade.service.ts.)
# Arguments: $1 - failure message
archive_failure_to_history() {
  local failure_msg="$1"
  _archive_to_history "false" "$failure_msg" "[]"
}
# Success-path counterpart of archive_failure_to_history. Earlier code
# relied on the API's handlePostRestartResult to archive, which only fires
# for auto-upgrade post-restart, so admin-UI-triggered successes were lost if
# the user dismissed the result card before the API polled. API-side
# archiveResult dedupes on completedAt, so a double append is safe.
# Arguments: $1 - success message
archive_success_to_history() {
  local success_msg="$1"
  local warnings="${UPGRADE_WARNINGS:-[]}"
  _archive_to_history "true" "$success_msg" "$warnings"
}
# Prepend one run record to history.json (newest first, at most 50 kept).
# Best effort: any failure in the helper is swallowed so archiving can never
# break the upgrade itself.
# Globals:   UPGRADE_DIR, PROJECT_DIR, PRE_UPGRADE_SHORT, COMMIT_COUNT,
#            START_TIME (read)
# Arguments: $1 - "true" or "false", $2 - message,
#            $3 - JSON array of warnings
_archive_to_history() {
  local success="$1" msg="$2" warnings_json="$3"
  local hist="${UPGRADE_DIR}/history.json"
  local new_commit duration completed_at
  mkdir -p "$UPGRADE_DIR"
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")"
  duration=$((SECONDS - ${START_TIME:-SECONDS}))
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  # Hand the raw field values to Python and let json do the encoding. Building
  # the entry by string interpolation broke on backslashes and newlines in the
  # message, and the record was then silently dropped.
  python3 - "$hist" "$success" "$msg" "${PRE_UPGRADE_SHORT:-unknown}" \
    "$new_commit" "${COMMIT_COUNT:-0}" "$duration" "$warnings_json" \
    "$completed_at" <<'PYEOF' 2>/dev/null || true
import json
import os
import sys

(hist_path, success, message, previous_commit, new_commit, commit_count,
 duration, warnings_raw, completed_at) = sys.argv[1:10]


def to_int(raw):
    try:
        return int(raw)
    except ValueError:
        return 0


try:
    warnings = json.loads(warnings_raw)
except ValueError:
    warnings = []

entry = {
    "success": success == "true",
    "message": message,
    "previousCommit": previous_commit,
    "newCommit": new_commit,
    "commitCount": to_int(commit_count),
    "durationSeconds": to_int(duration),
    "warnings": warnings,
    "completedAt": completed_at,
}

try:
    with open(hist_path) as f:
        history = json.load(f)
    if not isinstance(history, list):
        history = []
except Exception:
    history = []

history.insert(0, entry)
history = history[:50]

# Write next to the target and rename, so a crash cannot truncate the history.
tmp_path = hist_path + ".tmp"
with open(tmp_path, "w") as f:
    json.dump(history, f, indent=2)
os.replace(tmp_path, hist_path)
PYEOF
}
# =============================================================================
# Parse Arguments
# =============================================================================
# Print the usage text to stdout and terminate the script with status 0.
show_help() {
  printf '%s\n' \
    'Changemaker Lite V2 — Upgrade Script' \
    'Usage: ./scripts/upgrade.sh [OPTIONS]' \
    'Options:' \
    '--skip-backup Skip backup phase (requires --force)' \
    '--pull-services Also pull new third-party Docker images' \
    '--use-registry Pull pre-built images from Gitea registry instead of rebuilding' \
    '--dry-run Show what would happen without executing' \
    '--force Continue past non-critical warnings' \
    '--branch BRANCH Git branch to pull (default: current branch)' \
    '--rollback Rollback to pre-upgrade commit' \
    '--api-mode Write progress/result JSON for admin UI' \
    '--help Show this help message' \
    'Examples:' \
    './scripts/upgrade.sh # Standard upgrade (build from source)' \
    './scripts/upgrade.sh --use-registry # Fast upgrade using pre-built Gitea images' \
    './scripts/upgrade.sh --dry-run # Preview changes' \
    './scripts/upgrade.sh --pull-services # Also update PostgreSQL, Redis, etc.' \
    './scripts/upgrade.sh --rollback # Revert last upgrade'
  exit 0
}
# Walk the command line and set the flag variables declared under
# "Defaults". Unknown options abort with a usage hint.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-backup) SKIP_BACKUP=true; shift ;;
    --pull-services) PULL_SERVICES=true; shift ;;
    --dry-run) DRY_RUN=true; shift ;;
    --force) FORCE=true; shift ;;
    --branch)
      # Without this check a trailing "--branch" trips `set -u` on $2 and
      # the script dies with a bare "unbound variable" message.
      if [[ $# -lt 2 ]]; then
        error "--branch requires a branch name"
        echo "Run with --help for usage."
        exit 1
      fi
      BRANCH="$2"
      shift 2
      ;;
    --rollback) ROLLBACK=true; shift ;;
    --api-mode) API_MODE=true; shift ;;
    --use-registry) USE_REGISTRY=true; shift ;;
    --help|-h) show_help ;;
    *) error "Unknown option: $1"; echo "Run with --help for usage."; exit 1 ;;
  esac
done
# Validate flag combinations
if [[ "$SKIP_BACKUP" == "true" ]] && [[ "$FORCE" != "true" ]]; then
  error "--skip-backup requires --force (backup protects your data)"
  exit 1
fi
# =============================================================================
# Main
# =============================================================================
START_TIME=$SECONDS
cd "$PROJECT_DIR"
# Setup logging: everything written to stdout/stderr from here on is also
# appended to the per-run log file.
mkdir -p "$LOG_DIR"
exec > >(tee -a "$LOG_FILE") 2>&1
echo ""
echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}"
echo -e "${BOLD}${BLUE} Changemaker Lite V2 — Upgrade${NC}"
echo -e "${BOLD}${BLUE} ${TIMESTAMP}${NC}"
echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}"
if [[ "$DRY_RUN" == "true" ]]; then
  echo ""
  echo -e " ${YELLOW}DRY RUN — no changes will be made${NC}"
fi
trap on_failure EXIT
# SIGTERM/SIGINT must END the run. A handler that only calls on_failure
# returns to the interrupted script, which then carries on upgrading with
# the lock already released; and inside such a handler $? is the status of
# whatever command was interrupted (often 0), so no failure gets recorded.
# Converting the signal into an exit with the conventional 128+signal status
# routes it through the EXIT trap exactly once, with a non-zero status.
trap 'exit 143' TERM
trap 'exit 130' INT
acquire_lock
load_env
# Determine branch (source mode only — release installs have no git)
if [[ -z "$BRANCH" ]]; then
  if [[ "$INSTALL_MODE" == "release" ]]; then
    BRANCH="release"
  else
    BRANCH="$(git rev-parse --abbrev-ref HEAD)"
  fi
fi
# =============================================================================
# Rollback Mode
# =============================================================================
# --rollback: restore the previous code (release tarball or git commit),
# restart the containers, then exit. The database is never touched here.
if [[ "$ROLLBACK" == "true" ]]; then
  phase "R" "Rollback"
  if [[ "$INSTALL_MODE" == "release" ]]; then
    # Release-mode rollback: re-extract the prior release tarball recorded
    # in VERSION.rollback (seeded at Phase 3 start of any upgrade).
    PRIOR_TAG="$(cat "${UPGRADE_DIR}/VERSION.rollback" 2>/dev/null | head -1 || true)"
    if [[ -z "$PRIOR_TAG" ]]; then
      error "No VERSION.rollback marker found at ${UPGRADE_DIR}/VERSION.rollback"
      error "Cannot determine prior release. Run: curl -fSL <prior-tarball-url> | tar xz -C $PROJECT_DIR --strip-components=1"
      release_lock
      exit 1
    fi
    info "Rolling back to prior release: ${PRIOR_TAG}"
    TARBALL_URL="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/admin/changemaker.lite/releases/download/${PRIOR_TAG}/changemaker-lite-${PRIOR_TAG}.tar.gz"
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would download: $TARBALL_URL"
      info "[DRY RUN] Would extract to: $PROJECT_DIR (preserving .env)"
      info "[DRY RUN] Would run: docker compose pull api admin media-api nginx && docker compose up -d"
      release_lock
      exit 0
    fi
    ROLLBACK_DIR="$(mktemp -d)"
    if ! curl -fSL "$TARBALL_URL" -o "${ROLLBACK_DIR}/rb.tar.gz"; then
      error "Failed to download prior release tarball from ${TARBALL_URL}"
      rm -rf "$ROLLBACK_DIR"
      release_lock
      exit 1
    fi
    if ! tar xzf "${ROLLBACK_DIR}/rb.tar.gz" -C "$ROLLBACK_DIR"; then
      error "Failed to extract prior release tarball (${ROLLBACK_DIR}/rb.tar.gz)"
      rm -rf "$ROLLBACK_DIR"
      release_lock
      exit 1
    fi
    # First top-level directory of the extracted tarball. -quit stops find
    # after the first hit, so no `| head -1` (and no SIGPIPE under pipefail).
    ROLLBACK_SRC="$(find "$ROLLBACK_DIR" -maxdepth 1 -mindepth 1 -type d -print -quit)"
    # Guard: with an empty ROLLBACK_SRC the rsync source below would expand
    # to "/" and copy the host's root filesystem into the project.
    if [[ -z "$ROLLBACK_SRC" || ! -d "$ROLLBACK_SRC" ]]; then
      error "Prior release tarball has no top-level directory — refusing to sync"
      rm -rf "$ROLLBACK_DIR"
      release_lock
      exit 1
    fi
    rsync -a --exclude='.env' "$ROLLBACK_SRC/" "$PROJECT_DIR/"
    rm -rf "$ROLLBACK_DIR"
    success "Code rolled back to ${PRIOR_TAG}"
    # NOTE(review): this pulls the "latest" tag, while the message below says
    # the containers run ${PRIOR_TAG} images — confirm how the registry tags
    # releases and whether IMAGE_TAG should be the prior tag instead.
    export IMAGE_TAG="latest"
    docker compose pull api admin media-api nginx || warn "Some images failed to pull — check registry reachability"
    docker compose up -d
    success "Containers restarted on ${PRIOR_TAG} images"
  else
    # Source-mode rollback: legacy git-based flow.
    LATEST_ARCHIVE="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
    if [[ -z "$LATEST_ARCHIVE" ]]; then
      error "No backup archives found in $BACKUP_DIR"
      error "Cannot determine pre-upgrade commit. Manual rollback needed."
      release_lock
      exit 1
    fi
    info "Latest backup: $(basename "$LATEST_ARCHIVE")"
    ARCHIVE_DIR="$(basename "$LATEST_ARCHIVE" .tar.gz)"
    # The backup phase stores the pre-upgrade commit as git-commit.txt inside
    # the archive; read it without unpacking the rest.
    ROLLBACK_COMMIT="$(tar xzf "$LATEST_ARCHIVE" -O "${ARCHIVE_DIR}/git-commit.txt" 2>/dev/null || true)"
    if [[ -z "$ROLLBACK_COMMIT" ]]; then
      error "No git-commit.txt found in backup archive."
      error "Manually specify: git checkout <commit-hash>"
      release_lock
      exit 1
    fi
    info "Rolling back to commit: $ROLLBACK_COMMIT"
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would run: git checkout $ROLLBACK_COMMIT"
      info "[DRY RUN] Would rebuild: docker compose build $SOURCE_CONTAINERS"
      info "[DRY RUN] Would restart: docker compose up -d"
      release_lock
      exit 0
    fi
    git checkout -B "$BRANCH" "$ROLLBACK_COMMIT"
    # shellcheck disable=SC2086 — word splitting into service names is intended
    docker compose build $SOURCE_CONTAINERS
    docker compose up -d
    success "Rolled back to $ROLLBACK_COMMIT"
    echo ""
    echo -e " ${BOLD}Database restore:${NC}"
    echo -e " Code has been rolled back. Database was NOT rolled back."
    echo -e " The backup archive contains a PostgreSQL dump."
    echo -e " To restore (${RED}DESTRUCTIVE — replaces current data${NC}):"
    echo ""
    ARCHIVE_DIR_NAME="$(basename "$LATEST_ARCHIVE" .tar.gz)"
    echo -e " ${CYAN}tar xzf $LATEST_ARCHIVE -C /tmp${NC}"
    echo -e " ${CYAN}gunzip -c /tmp/$ARCHIVE_DIR_NAME/v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
    echo ""
  fi
  release_lock
  exit 0
fi
# =============================================================================
# Phase 1: Pre-flight Checks
# =============================================================================
phase "1" "Pre-flight Checks"
write_progress 1 "Pre-flight Checks" 5 "Verifying system requirements..."
# Docker
if command -v docker &>/dev/null; then
  success "Docker: $(docker --version | head -1)"
else
  error "Docker is not installed."
  exit 1
fi
if docker compose version &>/dev/null; then
  success "Docker Compose: $(docker compose version --short)"
else
  error "Docker Compose v2 plugin not found."
  exit 1
fi
# Docker daemon running
if docker info &>/dev/null; then
  success "Docker daemon running"
else
  error "Docker daemon not running."
  exit 1
fi
# Git
if command -v git &>/dev/null; then
  success "Git: $(git --version)"
else
  error "Git is not installed."
  exit 1
fi
# Remote reachable (source mode only — release mode pulls from Gitea API later)
if [[ "$INSTALL_MODE" == "source" ]]; then
  info "Checking git remote..."
  if timeout 10 git ls-remote origin HEAD &>/dev/null; then
    success "Git remote reachable"
  else
    error "Cannot reach git remote. Check your network or remote configuration."
    exit 1
  fi
fi
# Working directory checks
if [[ ! -f "$PROJECT_DIR/docker-compose.yml" ]]; then
  error "docker-compose.yml not found. Are you in the project root?"
  exit 1
fi
if [[ ! -f "$PROJECT_DIR/.env" ]]; then
  error ".env not found. Run ./config.sh first."
  exit 1
fi
success "Project files verified"
# Disk space. -P forces the one-line-per-filesystem POSIX layout; without it
# a long device name can wrap onto its own line and shift the columns that
# the awk expression reads.
AVAILABLE_MB=$(df -Pm "$PROJECT_DIR" | awk 'NR==2 {print $4}')
if [[ ! "$AVAILABLE_MB" =~ ^[0-9]+$ ]]; then
  # An empty or non-numeric value would otherwise be compared as 0 and be
  # reported as "insufficient disk space".
  error "Could not determine available disk space for $PROJECT_DIR."
  exit 1
fi
if [[ "$AVAILABLE_MB" -lt "$MIN_DISK_MB" ]]; then
  error "Insufficient disk space: ${AVAILABLE_MB}MB available, ${MIN_DISK_MB}MB required."
  exit 1
fi
success "Disk space: ${AVAILABLE_MB}MB available"
# Record pre-upgrade state
if [[ "$INSTALL_MODE" == "source" ]]; then
  PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)"
  PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)"
  info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))"
else
  # Release mode: derive "commit" from VERSION file (format: <tag>\n<sha>)
  PRE_UPGRADE_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")"
  PRE_UPGRADE_SHORT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
  info "Current version: $PRE_UPGRADE_SHORT"
fi
info "Target branch: $BRANCH"
# Record running containers (for restoring monitoring profile later).
# The list is captured first and searched afterwards: with `pipefail` on,
# `docker ps | grep -q` can report failure when grep exits on its first match
# and docker ps is killed by SIGPIPE, which would wrongly leave the flag false.
MONITORING_WAS_RUNNING=false
RUNNING_CONTAINER_NAMES="$(docker ps --format '{{.Names}}' || true)"
if grep -q 'prometheus-changemaker' <<<"$RUNNING_CONTAINER_NAMES"; then
  MONITORING_WAS_RUNNING=true
  info "Monitoring stack detected (will restart after upgrade)"
fi
# Source-mode-only checks: dirty files + upstream commit comparison
if [[ "$INSTALL_MODE" == "source" ]]; then
  # Refuse (unless --force) to upgrade over uncommitted edits in paths the
  # project itself owns.
  PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml"
  # shellcheck disable=SC2086 — PROJECT_OWNED_PATHS is split into pathspecs on purpose
  DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)"
  if [[ -n "$DIRTY_PROJECT_FILES" ]]; then
    warn "Uncommitted changes in project-owned files:"
    echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done
    if [[ "$FORCE" == "true" ]]; then
      warn "Continuing with --force (changes will be stashed)"
    else
      error "Commit or stash these changes first, or use --force to continue."
      exit 1
    fi
  fi
  # Compare the local HEAD with the remote branch tip; when they match there
  # is nothing to do unless --force was given.
  LOCAL_HEAD="$(git rev-parse HEAD)"
  REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)"
  if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then
    info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes."
    if [[ "$FORCE" == "true" ]]; then
      warn "Continuing with --force despite no upstream changes."
    else
      success "Nothing to upgrade."
      release_lock
      exit 0
    fi
  fi
fi
# Release mode: the upstream-version comparison happens later, in the
# release-mode branch of Phase 3 (Code Update), which queries the Gitea Releases API.
# =============================================================================
# Phase 2: Backup
# =============================================================================
phase "2" "Backup"
write_progress 2 "Backup" 15 "Creating backup..."
if [[ "$SKIP_BACKUP" == "true" ]]; then
  warn "Backup skipped (--skip-backup --force)"
else
  # Run existing backup script
  if [[ -x "$SCRIPT_DIR/backup.sh" ]]; then
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would run: scripts/backup.sh"
    else
      info "Running database backup..."
      "$SCRIPT_DIR/backup.sh"
      success "Database backup complete"
    fi
  else
    warn "scripts/backup.sh not found or not executable, skipping database backup"
  fi
  # Archive user-modifiable content
  USER_BACKUP="${BACKUP_DIR}/upgrade-user-content-${TIMESTAMP}.tar.gz"
  USER_BACKUP_FILES=()
  for p in "${USER_PATHS[@]}"; do
    if [[ -e "$PROJECT_DIR/$p" ]]; then
      USER_BACKUP_FILES+=("$p")
    fi
  done
  if [[ ${#USER_BACKUP_FILES[@]} -gt 0 ]]; then
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would archive user content: ${USER_BACKUP_FILES[*]}"
    else
      mkdir -p "$BACKUP_DIR"
      # Best effort: a failed archive is reported but does not stop the upgrade.
      if tar -czf "$USER_BACKUP" -C "$PROJECT_DIR" "${USER_BACKUP_FILES[@]}" 2>/dev/null; then
        success "User content archived: $(du -h "$USER_BACKUP" | cut -f1)"
      else
        warn "User content archive may be incomplete: $USER_BACKUP"
      fi
    fi
  fi
  # Save pre-upgrade commit hash for rollback reference
  LATEST_BACKUP="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
  if [[ -n "$LATEST_BACKUP" ]] && [[ "$DRY_RUN" != "true" ]]; then
    # Add git-commit.txt to the latest backup archive by unpacking it,
    # dropping the file in, and re-packing.
    COMMIT_TMPDIR="$(mktemp -d)"
    ARCHIVE_BASENAME="$(basename "$LATEST_BACKUP" .tar.gz)"
    # Re-pack ONLY if the archive unpacked cleanly into the expected
    # directory. Otherwise the re-pack would replace the database backup
    # with an archive holding nothing but git-commit.txt.
    if tar xzf "$LATEST_BACKUP" -C "$COMMIT_TMPDIR" 2>/dev/null \
      && [[ -d "$COMMIT_TMPDIR/$ARCHIVE_BASENAME" ]]; then
      # Written after extraction so an older git-commit.txt already inside
      # the archive cannot overwrite the current value.
      echo "$PRE_UPGRADE_COMMIT" > "$COMMIT_TMPDIR/$ARCHIVE_BASENAME/git-commit.txt"
      # Build the new archive beside the old one and rename it into place;
      # an interrupted re-pack then leaves the original intact.
      if tar czf "${LATEST_BACKUP}.tmp" -C "$COMMIT_TMPDIR" "$ARCHIVE_BASENAME" \
        && mv -f "${LATEST_BACKUP}.tmp" "$LATEST_BACKUP"; then
        success "Saved commit reference ($PRE_UPGRADE_SHORT) in backup archive"
      else
        rm -f "${LATEST_BACKUP}.tmp"
        warn "Could not re-pack $(basename "$LATEST_BACKUP") — backup left unchanged, no commit reference saved"
      fi
    else
      warn "Could not unpack $(basename "$LATEST_BACKUP") — backup left unchanged, no commit reference saved"
    fi
    rm -rf "$COMMIT_TMPDIR"
  fi
fi
# =============================================================================
# Phase 3: Code Update
# =============================================================================
phase "3" "Code Update"
write_progress 3 "Code Update" 30 "Pulling latest code..."

# --- Release mode: download tarball instead of git pull ---
# Asks the Gitea API for the latest release, downloads its .tar.gz asset,
# rsyncs it over the project (keeping .env and user paths) and stages the new
# VERSION file as VERSION.pending.
if [[ "$INSTALL_MODE" == "release" ]]; then
  GITEA_API="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/api/v1"
  CURRENT_VERSION=$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")
  info "Release mode — checking for updates (current: ${CURRENT_VERSION})..."

  RELEASE_JSON=$(curl -sf "${GITEA_API}/repos/admin/changemaker.lite/releases/latest" 2>/dev/null || true)
  if [[ -z "$RELEASE_JSON" ]]; then
    error "Could not reach Gitea API. Check network or GITEA_REGISTRY_URL."
    exit 1
  fi

  # "|| true" + explicit check: an unparsable response must produce an error
  # message instead of a silent set -e exit.
  LATEST_TAG=$(echo "$RELEASE_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null || true)
  if [[ -z "$LATEST_TAG" ]]; then
    error "Could not determine the latest release tag from the Gitea API response."
    exit 1
  fi
  TARBALL_URL=$(echo "$RELEASE_JSON" | python3 -c "
import sys, json
for a in json.load(sys.stdin).get('assets', []):
    if a['name'].endswith('.tar.gz'):
        print(a['browser_download_url']); break
" 2>/dev/null || true)

  # Dry run: mirror the source-mode dry-run branch below — report and exit
  # before anything in the project tree is modified.
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would update ${CURRENT_VERSION} → ${LATEST_TAG} (tarball: ${TARBALL_URL:-none found})"
    info "[DRY RUN] Would preserve user-modifiable paths: ${USER_PATHS[*]}"
    release_lock
    exit 0
  fi

  if [[ "$CURRENT_VERSION" == "$LATEST_TAG" ]] && [[ "$FORCE" != "true" ]]; then
    info "Already at latest version: ${CURRENT_VERSION}"
    write_progress 3 "Code Update" 45 "Already up to date"
  elif [[ -z "$TARBALL_URL" ]]; then
    error "No tarball found in release ${LATEST_TAG}"
    exit 1
  else
    info "Updating ${CURRENT_VERSION} → ${LATEST_TAG}..."
    write_progress 3 "Code Update" 35 "Downloading ${LATEST_TAG}..."

    # Download and unpack; clean the temp dir on every failure path.
    DOWNLOAD_DIR=$(mktemp -d)
    if ! curl -fSL "$TARBALL_URL" -o "${DOWNLOAD_DIR}/update.tar.gz"; then
      rm -rf "$DOWNLOAD_DIR"
      error "Failed to download release tarball: ${TARBALL_URL}"
      exit 1
    fi
    if ! tar xzf "${DOWNLOAD_DIR}/update.tar.gz" -C "$DOWNLOAD_DIR"; then
      rm -rf "$DOWNLOAD_DIR"
      error "Release tarball could not be extracted: ${TARBALL_URL}"
      exit 1
    fi
    UPDATE_SRC=$(find "$DOWNLOAD_DIR" -maxdepth 1 -mindepth 1 -type d | head -1 || true)
    # Guard: an empty UPDATE_SRC would turn the rsync below into a copy from
    # "/", and a missing VERSION would fail only after the tree was modified.
    if [[ -z "$UPDATE_SRC" ]] || [[ ! -f "$UPDATE_SRC/VERSION" ]]; then
      rm -rf "$DOWNLOAD_DIR"
      error "Release tarball has an unexpected layout (no top-level directory containing VERSION)"
      exit 1
    fi

    # Save user paths
    save_user_paths

    # Sync new files, preserving .env. VERSION is staged to a pending
    # location and only promoted after Phase 7 verification succeeds (Fix B),
    # so interrupted upgrades don't leave a misleading "upgraded" marker.
    # Also stash the CURRENT VERSION as VERSION.rollback so --rollback and
    # print_rollback_help know what release to restore on failure.
    write_progress 3 "Code Update" 40 "Applying update..."
    mkdir -p "$UPGRADE_DIR"
    if [[ -f "$PROJECT_DIR/VERSION" ]]; then
      cp "$PROJECT_DIR/VERSION" "$UPGRADE_DIR/VERSION.rollback"
    fi
    rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/"
    cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending"

    # Restore user paths
    restore_user_paths

    # Restore tracked files that may have been overwritten
    DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
    if [[ -n "$DELETED_TRACKED" ]]; then
      echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
    fi

    rm -rf "$DOWNLOAD_DIR"
    success "Updated to ${LATEST_TAG}"
  fi

  # Skip the git-based update flow below.
  # Second line of VERSION, or "release" when the file cannot be read.
  POST_PULL_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")"
elif [[ "$DRY_RUN" == "true" ]]; then
  info "[DRY RUN] Would fetch and show incoming changes:"
  git fetch origin "$BRANCH" 2>/dev/null || true
  INCOMING="$(git log --oneline HEAD..origin/"$BRANCH" 2>/dev/null || echo "(unable to preview)")"
  if [[ -n "$INCOMING" ]]; then
    echo "$INCOMING"
  else
    info "No new commits to pull."
  fi
  info "[DRY RUN] Would preserve user-modifiable paths: ${USER_PATHS[*]}"
  info "[DRY RUN] Would stash local changes, pull, and pop stash"
  release_lock
  exit 0
fi
# Source-mode git pull flow. Release mode handles its update via tarball
# download in the block above and skips this entire section.
#
# Sequence: save user paths -> clear skip-worktree flags -> fix ownership of
# unwritable directories -> stash local changes -> git pull -> pop the stash
# (auto-resolving conflicts under USER_PATHS in favour of the stashed copy)
# -> restore the saved user paths -> re-checkout tracked files that ended up
# deleted. Sets POST_PULL_COMMIT to the short hash of HEAD after the pull.
if [[ "$INSTALL_MODE" == "source" ]]; then
# Step 0: Save user-modifiable paths before any git operations
save_user_paths
# Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files)
# `git ls-files -v` tags skip-worktree entries with a leading "S".
# NOTE(review): awk '{print $2}' keeps only the first word of a path, so
# paths containing spaces would be truncated — confirm none exist.
SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)"
if [[ -n "$SKIP_WORKTREE_FILES" ]]; then
info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..."
echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree
success "Skip-worktree flags cleared"
fi
# Step 0c: Fix Docker-owned directories that block git checkout
# The chown runs inside a throwaway alpine container with the directory
# bind-mounted at /fix; a failure here is ignored.
for owned_dir in api/upgrade api/uploads api/configs; do
if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then
info "Fixing permissions on $owned_dir..."
docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true
fi
done
# Step 1: Stash user changes if any exist
# HAS_CHANGES records whether a stash was created so later steps know
# whether there is anything to pop.
HAS_CHANGES=false
if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
HAS_CHANGES=true
STASH_NAME="upgrade-${TIMESTAMP}"
info "Stashing local changes as '$STASH_NAME'..."
git stash push --include-untracked -m "$STASH_NAME"
success "Local changes stashed"
fi
# Step 3: Pull updates
# On failure the stash (if any) is left in place and the script exits 1.
info "Pulling updates from origin/$BRANCH..."
if ! git pull origin "$BRANCH" --no-edit 2>&1; then
error "git pull failed. This may indicate upstream force-push or branch issues."
if [[ "$HAS_CHANGES" == "true" ]]; then
warn "Your stashed changes can be recovered with: git stash pop"
fi
exit 1
fi
POST_PULL_COMMIT="$(git rev-parse --short HEAD)"
success "Updated to $POST_PULL_COMMIT"
# Step 4: Pop stash and handle conflicts
if [[ "$HAS_CHANGES" == "true" ]]; then
info "Restoring local changes..."
if git stash pop 2>&1; then
success "Local changes restored cleanly"
else
warn "Merge conflicts detected during stash pop"
# Auto-resolve user-modifiable paths by keeping user's version
RESOLVED_COUNT=0
for user_path in "${USER_PATHS[@]}"; do
# --diff-filter=U lists only unmerged (conflicted) files under this path.
CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)"
if [[ -n "$CONFLICTED" ]]; then
while IFS= read -r cf; do
info " Auto-resolving (keeping yours): $cf"
# While applying a stash, "theirs" is the stashed side, so --theirs
# selects the user's copy (matching the "keeping yours" message).
git checkout --theirs "$cf" 2>/dev/null || true
git add "$cf"
RESOLVED_COUNT=$((RESOLVED_COUNT + 1))
done < <(echo "$CONFLICTED")
fi
done
# Check if any conflicts remain in project-owned files
REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)"
if [[ -n "$REMAINING_CONFLICTS" ]]; then
error "Unresolved conflicts in project-owned files:"
echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done
echo ""
error "These files have upstream changes that conflict with your edits."
error "Resolve manually, then run the upgrade again."
info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT"
# NOTE(review): these conflicts come from `git stash pop`, not from a
# merge, so `git merge --abort` may not apply here — confirm the hint.
info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT"
exit 1
fi
if [[ $RESOLVED_COUNT -gt 0 ]]; then
success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)"
fi
fi
fi
# Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies)
restore_user_paths
# Step 4c: Restore any tracked files accidentally deleted by restore_user_paths
# (can happen when save_user_paths can't read root-owned files in user paths)
DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
if [[ -n "$DELETED_TRACKED" ]]; then
info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..."
echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
success "Tracked files restored from HEAD"
fi
fi
# End of source-mode git pull flow
# Step 5: Detect new env vars
# Every KEY that appears in .env.example but not in .env is appended to .env
# with the example's default value, and reported so the operator can review it.

#######################################
# Strip leading and trailing whitespace from a string (pure bash, so quotes
# and backslashes in the input are never interpreted).
# Arguments: $1 - string to trim
# Outputs:   trimmed string on stdout (no trailing newline)
#######################################
_env_trim() {
  local s="$1"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

#######################################
# Turn the right-hand side of a .env.example line into the value to append.
#   - surrounding whitespace is removed
#   - a value starting with a quote is kept verbatim (it may contain '#')
#   - for unquoted values an inline comment is removed only when the '#'
#     is preceded by whitespace, so URLs/colours containing '#' survive
#   - a value that is only a comment becomes empty
# Arguments: $1 - raw text after the first '=' of the line
# Outputs:   normalised value on stdout (no trailing newline)
#######################################
_env_example_default() {
  local raw
  raw="$(_env_trim "$1")"
  case "$raw" in
    \"* | \'*) ;;
    \#*) raw="" ;;
    *)
      raw="${raw%%[[:space:]]#*}"
      raw="$(_env_trim "$raw")"
      ;;
  esac
  printf '%s' "$raw"
}

info "Checking for new environment variables..."
if [[ -f "$PROJECT_DIR/.env.example" ]] && [[ -f "$PROJECT_DIR/.env" ]]; then
  NEW_VARS=()
  # "|| [[ -n $key ]]" also processes a final line that lacks a newline.
  while IFS='=' read -r key value || [[ -n "$key" ]]; do
    [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
    key="$(_env_trim "$key")"
    [[ ! "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]] && continue
    if ! grep -q "^${key}=" "$PROJECT_DIR/.env" 2>/dev/null; then
      value="$(_env_example_default "$value")"
      # Before the first append, make sure .env ends with a newline so the
      # new entry cannot be glued onto the existing last line.
      if [[ ${#NEW_VARS[@]} -eq 0 && -s "$PROJECT_DIR/.env" \
        && -n "$(tail -c 1 "$PROJECT_DIR/.env")" ]]; then
        echo >> "$PROJECT_DIR/.env"
      fi
      echo "${key}=${value}" >> "$PROJECT_DIR/.env"
      NEW_VARS+=("$key")
    fi
  done < "$PROJECT_DIR/.env.example"
  if [[ ${#NEW_VARS[@]} -gt 0 ]]; then
    warn "New env vars added to .env (review defaults):"
    for v in "${NEW_VARS[@]}"; do
      echo -e " ${CYAN}$v${NC}"
    done
  else
    success "No new environment variables"
  fi
fi
# Step 6: Print update summary (source mode only — release mode has no commit range)
# COMMIT_COUNT stays 0 in release mode and is reused by the final summary.
COMMIT_COUNT=0
if [[ "$INSTALL_MODE" == "source" ]]; then
  COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}"
  # rev-list --count prints a single number; on failure it prints nothing to
  # stdout, so the fallback yields exactly "0" (the old `| wc -l | xargs ||
  # echo 0` form produced "0<newline>0" under pipefail when git failed).
  COMMIT_COUNT="$(git rev-list --count "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null || echo 0)"
  echo ""
  info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)"
  git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 || true
  if [[ "$COMMIT_COUNT" -gt 20 ]]; then
    info " ... and $((COMMIT_COUNT - 20)) more"
  fi

  # Flag commits that may require manual attention
  # (commit message mentions BREAKING or [manual]).
  BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)"
  if [[ -n "$BREAKING_COMMITS" ]]; then
    echo ""
    warn "Commits requiring manual attention:"
    echo "$BREAKING_COMMITS" | while read -r line; do
      echo -e " ${YELLOW}$line${NC}"
    done
  fi
else
  info "Update summary: ${PRE_UPGRADE_SHORT} → release"
fi
# =============================================================================
# Phase 4: Container Rebuild
# =============================================================================
# Phase 4 obtains fresh images for the application containers, either by
# pulling pre-built images from the registry (USE_REGISTRY=true) or by
# building from source.
phase "4" "Container Rebuild"
write_progress 4 "Container Rebuild" 50 "Preparing containers..."
# Paths changed between the pre-upgrade commit and HEAD; empty when git diff
# fails. Used below (and in Phase 6) to decide what needs rebuilding.
# NOTE(review): `echo "$CHANGED_FILES" | grep -q` under pipefail can report
# "no match" if grep exits before echo finished writing a very large list.
CHANGED_FILES="$(git diff --name-only "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null || true)"
if [[ "$USE_REGISTRY" == "true" ]]; then
# --- Registry pull path: pull pre-built production images from Gitea ---
REGISTRY="${GITEA_REGISTRY:-gitea.bnkops.com/admin}"
# Images are tagged with the short commit hash; "latest" if git is unavailable.
REGISTRY_TAG="$(git rev-parse --short HEAD 2>/dev/null || echo "latest")"
# Exported for the docker compose invocations below — presumably the compose
# file interpolates these; verify against docker-compose.yml.
export GITEA_REGISTRY="$REGISTRY"
export IMAGE_TAG="$REGISTRY_TAG"
export BUILD_TARGET=production
info "Registry mode: ${REGISTRY} (tag: ${REGISTRY_TAG})"
write_progress 4 "Container Rebuild" 55 "Pulling images from registry..."
# Pull core app containers: try SHA tag → :latest fallback → source build
# NOTE: stderr intentionally flows through so slow/broken pulls are visible
# in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3
# systemd-killed upgrade with zero error trace.
PULLED_TAG=""
if docker compose pull api admin media-api; then
success "Core images pulled from registry (tag: ${REGISTRY_TAG})"
PULLED_TAG="$REGISTRY_TAG"
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
warn "Tag :${REGISTRY_TAG} not in registry — trying :latest"
export IMAGE_TAG="latest"
if docker compose pull api admin media-api; then
success "Core images pulled from registry (tag: latest)"
PULLED_TAG="latest"
# Retag :latest as :SHA so compose up uses consistent tags
for svc in api admin media-api; do
# Despite its name, local_img is an ordinary global (this is top-level code).
local_img="${REGISTRY}/changemaker-${svc}"
docker tag "${local_img}:latest" "${local_img}:${REGISTRY_TAG}" 2>/dev/null || true
done
export IMAGE_TAG="$REGISTRY_TAG"
else
warn "Registry pull failed for :latest too — falling back to source build"
export IMAGE_TAG="$REGISTRY_TAG"
# $SOURCE_CONTAINERS is left unquoted — presumably a space-separated
# service list that relies on word splitting; verify where it is defined.
docker compose build $SOURCE_CONTAINERS
success "Source containers rebuilt (registry fallback)"
fi
else
# REGISTRY_TAG is already "latest", so there is no further tag to try.
warn "Registry pull failed — falling back to source build"
docker compose build $SOURCE_CONTAINERS
success "Source containers rebuilt (registry fallback)"
fi
# nginx: try SHA → :latest → rebuild if config changed
NGINX_PULLED=false
if docker compose pull nginx; then
success "nginx pulled from registry (tag: ${IMAGE_TAG})"
NGINX_PULLED=true
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
export IMAGE_TAG="latest"
if docker compose pull nginx; then
docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true
success "nginx pulled from registry (tag: latest)"
NGINX_PULLED=true
fi
# Restore the SHA tag whether or not the :latest pull succeeded.
export IMAGE_TAG="$REGISTRY_TAG"
fi
if [[ "$NGINX_PULLED" == "false" ]]; then
# No registry image: rebuild locally only when files under nginx/ changed.
if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
info "Rebuilding nginx (config changed, not in registry)..."
docker compose build nginx
success "nginx rebuilt"
else
info "nginx unchanged, skipping rebuild"
fi
fi
else
# --- Source build path (original behaviour) ---
info "Rebuilding source containers: $SOURCE_CONTAINERS"
docker compose build $SOURCE_CONTAINERS
success "Source containers rebuilt"
# Conditionally rebuild containers whose Dockerfiles changed
# Only "nginx" has a rule here; other names in CONDITIONAL_CONTAINERS fall
# through the case statement without any action.
for svc in $CONDITIONAL_CONTAINERS; do
case "$svc" in
nginx)
if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
info "Rebuilding nginx (config changed)..."
docker compose build nginx
success "nginx rebuilt"
else
info "nginx unchanged, skipping rebuild"
fi
;;
esac
done
fi
# Optionally pull third-party images
# A failed pull is non-fatal, but it is reported as a warning instead of
# being followed by a success message.
if [[ "$PULL_SERVICES" == "true" ]]; then
  info "Pulling latest third-party images..."
  if docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog; then
    success "Third-party images updated"
  else
    warn "Some third-party images could not be pulled (non-fatal, continuing with existing images)"
  fi

  # Record image digests for audit trail
  # Best effort: any failure of compose or python3 is ignored.
  info "Recording image digests for audit trail..."
  docker compose images --format json 2>/dev/null | \
    python3 -c "import sys,json; [print(f' {i[\"Repository\"]}:{i[\"Tag\"]} -> {i[\"ID\"][:12]}') for i in json.load(sys.stdin)]" \
    2>/dev/null || true
fi
# =============================================================================
# Phase 5: Database Migration
# =============================================================================
phase "5" "Database Migration"
write_progress 5 "Database Migration" 55 "Checking database state..."

# Bring the infrastructure containers up before touching the database.
info "Ensuring infrastructure is up..."
docker compose up -d $INFRA_CONTAINERS

# Block until pg_isready succeeds inside the v2-postgres container, probing
# every 2 seconds and aborting the upgrade once PG_TIMEOUT seconds have passed.
info "Waiting for PostgreSQL..."
PG_WAIT=0
PG_TIMEOUT=60
until docker compose exec -T v2-postgres pg_isready -U "${V2_POSTGRES_USER:-changemaker}" >/dev/null 2>&1; do
  sleep 2
  PG_WAIT=$((PG_WAIT + 2))
  if ((PG_WAIT >= PG_TIMEOUT)); then
    error "PostgreSQL did not become ready within ${PG_TIMEOUT}s"
    exit 1
  fi
done
success "PostgreSQL ready (${PG_WAIT}s)"
# Check for failed/incomplete migrations
# A migration counts as failed when it was rolled back, or when it started
# more than 10 minutes ago and never finished. Each one is marked as applied
# via `prisma migrate resolve --applied` in a one-off api container.
info "Checking for failed migrations..."
FAILED_MIGRATIONS="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT migration_name FROM _prisma_migrations
WHERE rolled_back_at IS NOT NULL
OR (finished_at IS NULL AND started_at IS NOT NULL
AND started_at < NOW() - INTERVAL '10 minutes')
" 2>/dev/null || true)"
if [[ -n "$FAILED_MIGRATIONS" ]]; then
  warn "Found failed/incomplete migrations — auto-resolving..."
  UNRESOLVED_MIGRATIONS=0
  while IFS= read -r migration_name; do
    [[ -z "$migration_name" ]] && continue
    info " Resolving: $migration_name"
    # stdin is redirected from /dev/null: `docker compose run` keeps STDIN
    # open by default and would otherwise swallow the remaining migration
    # names that this loop is reading from the here-string.
    if ! docker compose run --rm --no-deps --entrypoint "" api \
      npx prisma migrate resolve --applied "$migration_name" </dev/null 2>&1; then
      warn " Could not auto-resolve $migration_name (may need manual intervention)"
      UNRESOLVED_MIGRATIONS=$((UNRESOLVED_MIGRATIONS + 1))
    fi
  done <<< "$FAILED_MIGRATIONS"
  # Only claim success when every migration was actually resolved.
  if [[ $UNRESOLVED_MIGRATIONS -eq 0 ]]; then
    success "Failed migrations resolved"
  else
    warn "$UNRESOLVED_MIGRATIONS failed migration(s) could not be auto-resolved"
  fi
else
  success "No failed migrations found"
fi
# Preview pending migrations before applying
# Purely informational: prints the indented, digit-led lines of
# `prisma migrate status` when it reports unapplied migrations.
info "Checking pending migrations..."
PENDING_OUTPUT="$(docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma migrate status 2>&1 || true)"
if grep -q "Following migration" <<< "$PENDING_OUTPUT"; then
  info "Pending migrations to apply:"
  # grep exits 1 when no line matches; with pipefail + set -e that would
  # abort the whole upgrade from a display-only step, so it is neutralised.
  { grep -E "^\s+[0-9]" <<< "$PENDING_OUTPUT" || true; } | while read -r line; do
    echo " $line"
  done
fi
# Run migrations in a one-off container (catches errors here, not in a restart loop)
# `prisma migrate deploy` runs with the api image's entrypoint cleared and
# without starting dependencies (--no-deps). On failure, recovery hints are
# printed and the upgrade exits with status 1.
info "Running database migrations..."
write_progress 5 "Database Migration" 60 "Applying migrations..."
if ! docker compose run --rm --no-deps --entrypoint "" api \
npx prisma migrate deploy 2>&1; then
error "Database migration failed!"
error ""
error "Common fixes:"
error " 1. Check migration status:"
# NOTE(review): the hint hard-codes user "changemaker" and database
# "changemaker_v2", while the real queries in this phase honour
# V2_POSTGRES_USER / V2_POSTGRES_DB — the hint may not match custom setups.
error " docker compose exec v2-postgres psql -U changemaker -d changemaker_v2 \\"
error " -c \"SELECT migration_name, finished_at, rolled_back_at FROM _prisma_migrations ORDER BY started_at DESC LIMIT 10;\""
error " 2. Mark a stuck migration as applied:"
error " docker compose run --rm --no-deps --entrypoint '' api npx prisma migrate resolve --applied <migration_name>"
error " 3. Check logs: docker compose logs api --tail 50"
error ""
error "After fixing, re-run: ./scripts/upgrade.sh --force --skip-backup"
exit 1
fi
# Count applied migrations
# Informational only: falls back to "?" when the query cannot be run.
MIGRATION_COUNT="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c "
SELECT COUNT(*) FROM _prisma_migrations WHERE finished_at IS NOT NULL
" 2>/dev/null || echo "?")"
success "Migrations up to date ($MIGRATION_COUNT total applied)"
# Run database seed (idempotent)
# A failing seed is non-fatal; it is reported as a warning and, unlike
# before, is no longer followed by a "seed complete" success message.
info "Running database seed..."
write_progress 5 "Database Migration" 65 "Seeding database..."
if docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma db seed 2>&1; then
  success "Database seed complete"
else
  warn "Database seed had warnings (non-fatal, continuing)"
fi
# Verify migration state is clean (no drift)
# Re-runs `prisma migrate status` and aborts the upgrade when its output
# mentions "failed", "drift" or "out of sync" (case-insensitive).
info "Verifying migration state..."
MIGRATE_STATUS="$(
  docker compose run --rm --no-deps --entrypoint "" api \
    npx prisma migrate status 2>&1 || true
)"
if grep -qiE "failed|drift|out of sync" <<< "$MIGRATE_STATUS"; then
  error "Schema drift detected after migration!"
  echo "$MIGRATE_STATUS"
  exit 1
fi
success "Schema state verified — no drift"
# =============================================================================
# Phase 6: Service Restart
# =============================================================================
# Phase 6, part 1: stop the application containers, clear containers whose
# anonymous volumes must not be reused, then start the API on its own so its
# health can be checked before everything else comes up.
phase "6" "Service Restart"
write_progress 6 "Service Restart" 70 "Restarting services..."
# Graceful shutdown with extended drain period (allow in-flight requests to complete)
STOP_TIMEOUT=30
info "Stopping application containers (${STOP_TIMEOUT}s grace period)..."
# Errors from stop are ignored and success is logged regardless.
docker compose stop -t $STOP_TIMEOUT $APP_CONTAINERS 2>/dev/null || true
success "Application containers stopped"
# Force-recreate LSIO containers to prevent anonymous volume shadowing bind mounts.
# LSIO images define a VOLUME at /config in their Dockerfile. When a container is
# merely restarted, Docker reuses the old anonymous volume whose /config/www is empty,
# which shadows the bind mount (e.g., ./mkdocs/site:/config/www → 403 Forbidden).
# Removing the container first ensures a fresh anonymous volume that respects bind mounts.
info "Removing LSIO containers (clearing anonymous volumes)..."
docker compose rm -sf $LSIO_VOLUME_CONTAINERS 2>/dev/null || true
success "LSIO containers cleared for fresh recreation"
# Verify Gancio config.json exists before starting services
verify_gancio_config
# Detect if npm dependencies changed (stale anonymous volumes cause missing modules)
# Looks for package.json / package-lock.json directly under api/ or admin/
# in CHANGED_FILES (computed in Phase 4).
NEEDS_VOLUME_REFRESH=false
if echo "$CHANGED_FILES" | grep -qE "^(api|admin)/(package\.json|package-lock\.json)"; then
NEEDS_VOLUME_REFRESH=true
warn "Package dependencies changed — will recreate containers with fresh volumes"
fi
# Start API (migrations already applied in Phase 5)
info "Starting API..."
if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
info "Removing old API/admin containers (clearing stale node_modules volumes)..."
# -v also removes the anonymous volumes attached to the removed containers.
docker compose rm -sfv api admin 2>/dev/null || true
fi
docker compose up -d api
# Poll API health check
# Probes /api/health from inside the api container every HEALTH_INTERVAL
# seconds. The upgrade aborts if the container stops running or if
# HEALTH_TIMEOUT seconds pass without a healthy response.
info "Waiting for API health check..."
API_WAIT=0
until docker compose exec -T api wget -q --spider http://localhost:4000/api/health 2>/dev/null; do
  # Detect container crash early (don't wait full timeout)
  if ! docker compose ps api --format '{{.State}}' 2>/dev/null | grep -q "running"; then
    error "API container exited unexpectedly"
    docker compose logs api --tail 20
    exit 1
  fi

  sleep "$HEALTH_INTERVAL"
  API_WAIT=$((API_WAIT + HEALTH_INTERVAL))

  if ((API_WAIT >= HEALTH_TIMEOUT)); then
    error "API did not become healthy within ${HEALTH_TIMEOUT}s"
    error "Check logs: docker compose logs api --tail 50"
    exit 1
  fi
done
success "API healthy (${API_WAIT}s)"
# Start everything else (exclude one-shot init containers)
# --scale <service>=0 keeps the listed init services from being started.
info "Starting remaining services..."
docker compose up -d \
--scale listmonk-init=0 \
--scale gancio-init=0 \
--scale vaultwarden-init=0
success "All services started"
# Restart Pangolin tunnel connector if running (may hold stale state after nginx rebuild)
# Matches any running container whose name contains "newt"; a failed restart
# is ignored and success is logged regardless.
if docker ps --format '{{.Names}}' | grep -q 'newt'; then
info "Restarting Pangolin tunnel connector..."
docker compose restart newt 2>/dev/null || true
success "Newt tunnel restarted"
fi
# Restart monitoring if it was running before
# MONITORING_WAS_RUNNING is set outside this section; failure to bring the
# monitoring profile up only produces a warning.
if [[ "$MONITORING_WAS_RUNNING" == "true" ]]; then
info "Restarting monitoring stack..."
if docker compose --profile monitoring up -d 2>&1; then
success "Monitoring stack restarted"
else
warn "Monitoring stack restart had errors (non-fatal, services may already be running)"
fi
fi
# =============================================================================
# Phase 7: Post-Upgrade Verification
# =============================================================================
phase "7" "Post-Upgrade Verification"
write_progress 7 "Verification" 90 "Running health checks..."
# Set to true by any failed check in this phase; checked once after all the
# health probes have run to decide whether a warning is recorded.
VERIFY_FAILED=false
#######################################
# Poll a health-check command until it succeeds or a time budget runs out.
# Globals:   VERIFY_FAILED (set to "true" when the budget is exhausted)
# Arguments: $1 - human-readable service name used in log lines
#            $2 - shell command string, run through eval (stderr discarded)
#            $3 - budget in seconds, counted in 3-second steps (default 30)
# Returns:   always 0 — under set -e a non-zero return would terminate the
#            script before write_result runs; callers read VERIFY_FAILED.
#######################################
verify_service_health() {
  local label="$1"
  local probe="$2"
  local budget="${3:-30}"
  local spent=0

  until ((spent >= budget)); do
    if eval "$probe" 2>/dev/null; then
      success "$label: healthy (${spent}s)"
      return 0
    fi
    sleep 3
    spent=$((spent + 3))
  done

  warn "$label: not responding after ${budget}s"
  VERIFY_FAILED=true
  return 0
}
# Per-service health probes. Each verify_service_health call takes a label,
# a command string and a time budget in seconds; a failed probe only sets
# VERIFY_FAILED and never aborts the script.
# API health (with polling — may still be running migrations)
verify_service_health "API (port 4000)" \
"docker compose exec -T api wget -q --spider http://localhost:4000/api/health" 45
# Admin health — 90s matches the admin container's start_period + a cushion
# for first-boot Vite bundling. 30s was aspirational and produced cry-wolf
# warnings on every successful upgrade.
verify_service_health "Admin (port 3000)" \
"docker compose exec -T admin wget -q --spider http://localhost:3000/" 90
# Media API health (optional — may not be enabled)
# Probed only when a running container name contains "changemaker-media-api".
if docker ps --format '{{.Names}}' | grep -q 'changemaker-media-api'; then
verify_service_health "Media API (port 4100)" \
"docker compose exec -T media-api wget -q --spider http://127.0.0.1:4100/health" 30
fi
# Gancio health (optional) — restart loop is still a hard signal, but
# "starting" now gets retry grace instead of passing silently.
if docker ps --format '{{.Names}}' | grep -q 'gancio-changemaker'; then
if docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -qi "restarting"; then
warn "Gancio: restart loop detected (check config.json in gancio-data volume)"
VERIFY_FAILED=true
else
# Healthy means the compose status text contains "healthy".
# NOTE(review): "unhealthy" also contains that substring — confirm the
# status format cannot produce a false positive here.
verify_service_health "Gancio" \
"docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -q healthy" 60
fi
fi
# MkDocs static site health (retry — first-boot rebuild can lag)
# The port is expanded now, when the command string is built, not at eval time.
if docker ps --format '{{.Names}}' | grep -q 'mkdocs-site-server'; then
verify_service_health "MkDocs site (port ${MKDOCS_SITE_SERVER_PORT:-4004})" \
"curl -sf http://localhost:${MKDOCS_SITE_SERVER_PORT:-4004}/ -o /dev/null" 30
fi
# Check for containers in restart loop
# Any line of `docker compose ps` mentioning "restarting" counts.
RESTARTING="$(docker compose ps 2>/dev/null | grep -i "restarting" || true)"
if [[ -n "$RESTARTING" ]]; then
warn "Containers in restart loop:"
echo "$RESTARTING"
VERIFY_FAILED=true
fi
if [[ "$VERIFY_FAILED" == "true" ]]; then
warn "Some health checks failed. Services may still be starting."
info "Check logs: docker compose logs --tail 50"
# Replaces UPGRADE_WARNINGS with a one-element JSON array; it is passed to
# write_result in the final summary.
UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
else
success "All health checks passed"
fi
# --- External reachability probe (Fix C) ---
# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be
# caught by the localhost-only checks above. Warn (don't fail) because
# transient tunnel issues should not roll back a successful upgrade.
# Runs only when DOMAIN is set and curl is installed.
if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then
  info "Probing external API at https://api.${DOMAIN}/api/health ..."
  # curl's -w '%{http_code}' already prints "000" when the request fails, so
  # the fallback must not append a second "000" (which produced "000000").
  # The default below only applies if curl printed nothing at all.
  EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
    "https://api.${DOMAIN}/api/health" 2>/dev/null)" || true
  EXT_CODE="${EXT_CODE:-000}"
  if [[ "$EXT_CODE" == "200" ]]; then
    success "External API reachable (HTTP 200)"
  else
    warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel"
    # Append the warning to the JSON array in UPGRADE_WARNINGS. If python3
    # fails, the previous value is kept unchanged.
    UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS"
import json, sys
try:
    w = json.loads(sys.argv[1]) if sys.argv[1] else []
except Exception:
    w = []
if not isinstance(w, list):
    w = []
w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}")
print(json.dumps(w))
PYEOF
)"
  fi
fi
# --- Atomic VERSION promotion (Fix B) ---
# The staged VERSION from Phase 3 lands now that we've reached the end of
# Phase 7 without on_failure firing. Promote regardless of VERIFY_FAILED —
# that flag is a soft health-check warning (e.g. "admin slow to respond"),
# not an upgrade failure. The tarball is extracted, containers are up, and
# write_result below will record success=true. Gating promotion on
# VERIFY_FAILED previously caused a "stuck at old VERSION" bug where a
# transient admin healthcheck warning pinned the install back.
# Hard failures (SIGTERM, exit !=0) still prevent promotion via on_failure,
# which rm -f's VERSION.pending before it can be promoted.
# VERSION.pending is written only by the release-mode update in Phase 3, so
# this is a no-op when that update did not run.
if [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then
mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION"
# Log the first line of the promoted file, or "?" if it cannot be read.
success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")"
fi
# =============================================================================
# Summary
# =============================================================================
# Final summary: record the result, print a human-readable recap, release
# the lock and clear the EXIT trap.
ELAPSED="$(elapsed)"
# FINAL_COMMIT is the short git hash in source mode, or the first line of
# VERSION (falling back to "release") in release mode.
if [[ "$INSTALL_MODE" == "source" ]]; then
  FINAL_COMMIT="$(git rev-parse --short HEAD)"
else
  FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
fi

# Built once so the result file and the history entry carry the same text.
# The " → " separator keeps the old and new revisions from running together.
UPGRADE_SUMMARY="Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)"
write_progress 7 "Verification" 100 "Upgrade complete!"
write_result "true" "$UPGRADE_SUMMARY" "$UPGRADE_WARNINGS"
archive_success_to_history "$UPGRADE_SUMMARY"

echo ""
echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}"
echo -e "${BOLD}${GREEN} Upgrade Complete${NC}"
echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}"
echo ""
echo -e " ${BOLD}Previous:${NC} $PRE_UPGRADE_SHORT"
if [[ "$INSTALL_MODE" == "source" ]]; then
  echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD 2>/dev/null || echo "$FINAL_COMMIT"))"
else
  echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT"
fi
echo -e " ${BOLD}Commits:${NC} $COMMIT_COUNT"
echo -e " ${BOLD}Duration:${NC} $ELAPSED"
echo -e " ${BOLD}Log:${NC} $LOG_FILE"
echo ""

# Release the lock first, then drop the EXIT trap so it does not fire on
# this successful exit path.
release_lock
trap - EXIT