#!/usr/bin/env bash
# =============================================================================
# Changemaker Lite V2 — Upgrade Script
# Safely pulls updates, rebuilds containers, and restarts services.
# Usage: ./scripts/upgrade.sh [OPTIONS]
# =============================================================================

set -euo pipefail

# --- Configuration ---
# Paths are derived from this script's own location so the script works
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_DIR="${PROJECT_DIR}/logs"
LOG_FILE="${LOG_DIR}/upgrade-${TIMESTAMP}.log"
LOCK_FILE="${PROJECT_DIR}/.upgrade.lock"
BACKUP_DIR="${BACKUP_DIR:-$PROJECT_DIR/backups}"   # overridable via environment
HEALTH_TIMEOUT=120   # seconds (presumably used by a later health-check phase — TODO confirm, not visible in this excerpt)
HEALTH_INTERVAL=5    # seconds between health probes — same caveat as above
MIN_DISK_MB=2048     # refuse to upgrade with less free disk than this

# Tracks which phase the upgrade is currently in, so on_failure can report
# "killed during Phase 4: Container Rebuild" instead of just an exit code.
CURRENT_PHASE_NAME=""

# Warnings accumulated during the run — surfaced in result.json. Global so
# Phase 7 probes (external reachability) can append without losing earlier
# entries set in the Phase 7 verification block.
UPGRADE_WARNINGS="[]"

# Source-built containers (always rebuilt)
SOURCE_CONTAINERS="api admin media-api"
# Conditionally rebuilt if Dockerfile changed
CONDITIONAL_CONTAINERS="nginx"
# App containers stopped during upgrade
APP_CONTAINERS="api admin media-api nginx"
# Infrastructure containers (must stay up)
INFRA_CONTAINERS="v2-postgres redis"
# LSIO containers with anonymous /config volumes (must be force-recreated on upgrade
# to prevent stale anonymous volumes from shadowing bind mounts underneath /config)
LSIO_VOLUME_CONTAINERS="mkdocs-site-server"

# User-modifiable paths (auto-resolve keeps user version on conflict)
USER_PATHS=(
  "mkdocs/docs/"
  "mkdocs/mkdocs.yml"
  "mkdocs/site/"
  "configs/"
  "nginx/conf.d/services.conf"
)

# --- Detect install mode ---
# Release installs ship a VERSION file and no .git directory; everything else
# is treated as a source (git clone) install.
if [[ -f "$PROJECT_DIR/VERSION" ]] && [[ ! -d "$PROJECT_DIR/.git" ]]; then
  INSTALL_MODE="release"
else
  INSTALL_MODE="source"
fi

# --- Defaults ---
SKIP_BACKUP=false
PULL_SERVICES=false
DRY_RUN=false
FORCE=false
BRANCH=""
ROLLBACK=false
API_MODE=false
USE_REGISTRY=false
# Release installs always use registry mode
if [[ "$INSTALL_MODE" == "release" ]]; then
  USE_REGISTRY=true
fi

# --- Colors (respects NO_COLOR convention) ---
# Only colorize when stdout is a TTY and NO_COLOR is unset/empty.
if [[ -t 1 ]] && [[ -z "${NO_COLOR:-}" ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  CYAN='\033[0;36m'
  BOLD='\033[1m'
  DIM='\033[2m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  CYAN=''
  BOLD=''
  DIM=''
  NC=''
fi

# =============================================================================
# Utility Functions
# =============================================================================

# Leveled log helpers; only error() writes to stderr.
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
success() { echo -e "${GREEN}[ OK ]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
error() { echo -e "${RED}[ERR ]${NC} $*" >&2; }

# phase NUM NAME — prints a banner announcing an upgrade phase.
phase() {
  echo ""
  echo -e "${BOLD}${BLUE}═══════════════════════════════════════════════${NC}"
  echo -e "${BOLD}${BLUE} Phase $1: $2${NC}"
  echo -e "${BOLD}${BLUE}═══════════════════════════════════════════════${NC}"
  echo ""
}

# --- API mode: JSON progress/result writing ---
UPGRADE_DIR="${PROJECT_DIR}/data/upgrade"
PROGRESS_FILE="${UPGRADE_DIR}/progress.json"
RESULT_FILE="${UPGRADE_DIR}/result.json"

# write_progress NUM NAME PCT MSG — records the current phase for the admin UI
# (JSON written to $PROGRESS_FILE when --api-mode is on).
write_progress() {
  local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
  # Track phase name for on_failure regardless of API_MODE — useful for logs too.
  CURRENT_PHASE_NAME="$phase_name"
  [[ "$API_MODE" != "true" ]] && return
  mkdir -p "$UPGRADE_DIR"
  # NOTE(review): the JSON heredoc bodies of write_progress and write_result
  # (and the boundary between the two functions, including write_result's
  # local variables such as duration_secs/warnings_json) appear to have been
  # lost when this file was extracted — the fused redirections and dangling
  # `)"` / `REOF` fragments below are not valid shell as-is. Recover this
  # region from version control before editing.
  cat > "$PROGRESS_FILE" < "$RESULT_FILE" </dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
  "commitCount": ${COMMIT_COUNT:-0},
  "durationSeconds": ${duration_secs},
  "warnings": ${warnings_json},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  # Clean up progress file
  rm -f "$PROGRESS_FILE"
  # Update status.json with new commit info
  if [[ -x "$SCRIPT_DIR/upgrade-check.sh" ]]; then
    "$SCRIPT_DIR/upgrade-check.sh" 2>/dev/null || true
  fi
}

# elapsed — prints wall-clock time since START_TIME as "Xm Ys".
elapsed() {
  local secs=$((SECONDS - START_TIME))
  printf '%dm %ds' $((secs / 60)) $((secs % 60))
}

# --- Save/restore user-modifiable paths across git pull ---

# Copies every existing USER_PATHS entry into a fresh temp dir.
# Sets the global USER_SAVE_DIR consumed by restore_user_paths/on_failure.
save_user_paths() {
  USER_SAVE_DIR="$(mktemp -d)"
  for p in "${USER_PATHS[@]}"; do
    if [[ -e "$PROJECT_DIR/$p" ]]; then
      mkdir -p "$USER_SAVE_DIR/$(dirname "$p")"
      cp -a "$PROJECT_DIR/$p" "$USER_SAVE_DIR/$p"
    fi
  done
}

# Puts the saved USER_PATHS copies back (overwriting whatever the update left
# behind), then removes the temp dir. No-op if save_user_paths never ran.
restore_user_paths() {
  if [[ -z "${USER_SAVE_DIR:-}" ]] || [[ ! -d "${USER_SAVE_DIR:-}" ]]; then
    return
  fi
  local restored=0
  for p in "${USER_PATHS[@]}"; do
    if [[ -e "$USER_SAVE_DIR/$p" ]]; then
      # Ensure parent directory exists (in case pull deleted it)
      mkdir -p "$PROJECT_DIR/$(dirname "$p")"
      # Use docker alpine to remove if regular rm fails (root-owned files from containers)
      rm -rf "$PROJECT_DIR/$p" 2>/dev/null || \
        docker run --rm -v "$PROJECT_DIR:/project" alpine rm -rf "/project/$p" 2>/dev/null || true
      cp -a "$USER_SAVE_DIR/$p" "$PROJECT_DIR/$p"
      restored=$((restored + 1))
    fi
  done
  rm -rf "$USER_SAVE_DIR"
  if [[ $restored -gt 0 ]]; then
    success "Restored $restored user-modifiable path(s)"
  fi
}

# --- Verify Gancio config.json in its data volume ---
# Gancio uses a named Docker volume for /home/node/data. If the volume loses
# config.json (e.g., volume name prefix mismatch after compose project rename),
# Gancio detects an existing DB but no config and refuses to start with:
# "Non empty db! Please move your current db elsewhere than retry."
# This regenerates config.json from .env vars when missing.
verify_gancio_config() {
  local gancio_volume
  gancio_volume="$(docker volume ls --format '{{.Name}}' | grep 'gancio-data' | head -1 || true)"
  if [[ -z "$gancio_volume" ]]; then
    return # No gancio volume exists yet; first run will handle it
  fi
  # Check if config.json exists and is non-empty
  if docker run --rm -v "${gancio_volume}:/data" alpine test -s /data/config.json 2>/dev/null; then
    success "Gancio config.json present in $gancio_volume"
    return
  fi
  warn "Gancio config.json missing in volume $gancio_volume — regenerating from .env"
  # Fall back to defaults when the corresponding .env vars are unset.
  local base_url="${GANCIO_BASE_URL:-https://events.cmlite.org}"
  local pg_user="${V2_POSTGRES_USER:-changemaker}"
  local pg_pass="${V2_POSTGRES_PASSWORD:-changemaker}"
  local config_json="{\"baseurl\":\"${base_url}\",\"server\":{\"host\":\"0.0.0.0\",\"port\":13120},\"db\":{\"dialect\":\"postgres\",\"host\":\"changemaker-v2-postgres\",\"port\":5432,\"database\":\"gancio\",\"username\":\"${pg_user}\",\"password\":\"${pg_pass}\"}}"
  # Write into the volume via a throwaway container, then chown to uid/gid
  # 1000 (the node user Gancio runs as).
  docker run --rm -v "${gancio_volume}:/data" alpine sh -c \
    "echo '${config_json}' > /data/config.json && chown 1000:1000 /data/config.json"
  success "Gancio config.json regenerated"
}

# --- Lockfile ---
# Single-instance guard: refuse to run while another upgrade (live PID in the
# lock file) is in flight; clean up stale locks left by dead processes.
acquire_lock() {
  if [[ -f "$LOCK_FILE" ]]; then
    local old_pid
    old_pid=$(cat "$LOCK_FILE" 2>/dev/null || echo "")
    # kill -0 only probes whether the process exists; it sends no signal.
    if [[ -n "$old_pid" ]] && kill -0 "$old_pid" 2>/dev/null; then
      error "Another upgrade is running (PID $old_pid). If stale, remove $LOCK_FILE"
      exit 1
    fi
    warn "Removing stale lock file (PID $old_pid no longer running)"
    rm -f "$LOCK_FILE"
  fi
  echo $$ > "$LOCK_FILE"
}

release_lock() {
  rm -f "$LOCK_FILE"
}

# --- .env loading (from backup.sh — handles special chars) ---
# Exports KEY=VALUE pairs from .env, skipping comments and blank lines,
# stripping one layer of surrounding quotes, and accepting only keys that are
# valid shell identifiers.
load_env() {
  if [[ -f "$PROJECT_DIR/.env" ]]; then
    while IFS='=' read -r key value; do
      [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
      key="$(echo "$key" | xargs)"
      value="${value%\"}"
      value="${value#\"}"
      value="${value%\'}"
      value="${value#\'}"
      if [[ "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]]; then
        export "$key=$value"
      fi
    done < "$PROJECT_DIR/.env"
  fi
}

# --- Print rollback instructions ---
# Emits copy-pasteable recovery steps, branched on install mode.
print_rollback_help() {
  local commit="${PRE_UPGRADE_COMMIT:-unknown}"
  echo ""
  echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
  echo -e "${BOLD}${RED} Upgrade Failed — Rollback Instructions${NC}"
  echo -e "${BOLD}${RED}═══════════════════════════════════════════════${NC}"
  echo ""
  if [[ "$INSTALL_MODE" == "release" ]]; then
    # Release installs have no .git — rollback is "re-download the prior tarball".
    # VERSION.rollback is seeded at the start of Phase 3 so we always know what
    # tag to go back to, across multiple failed attempts.
    local prior
    prior="$(cat "${UPGRADE_DIR}/VERSION.rollback" 2>/dev/null | head -1 || echo "vX.Y.Z")"
    echo -e " ${BOLD}1.${NC} Restore prior release tarball (${BOLD}${prior}${NC}):"
    echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
    echo -e " ${CYAN}URL=https://gitea.bnkops.com/admin/changemaker.lite/releases/download/${prior}/changemaker-lite-${prior}.tar.gz${NC}"
    echo -e " ${CYAN}curl -fSL \"\$URL\" -o /tmp/rb.tar.gz && tar xzf /tmp/rb.tar.gz --strip-components=1 -C $PROJECT_DIR${NC}"
    echo ""
    echo -e " ${BOLD}2.${NC} Pull prior images and restart:"
    echo -e " ${CYAN}docker compose pull api admin media-api nginx${NC}"
    echo -e " ${CYAN}docker compose up -d${NC}"
  else
    echo -e " ${BOLD}1.${NC} Restore code to pre-upgrade commit:"
    echo -e " ${CYAN}cd $PROJECT_DIR${NC}"
    echo -e " ${CYAN}git checkout $commit${NC}"
    echo ""
    echo -e " ${BOLD}2.${NC} Rebuild and restart:"
    echo -e " ${CYAN}docker compose build api admin media-api${NC}"
    echo -e " ${CYAN}docker compose up -d${NC}"
  fi
  echo ""
  echo -e " ${BOLD}3.${NC} If database rollback is needed (destructive!):"
  echo -e " ${CYAN}# Find backup archive:${NC}"
  echo -e " ${CYAN}ls -lt $BACKUP_DIR/changemaker-v2-backup-*.tar.gz | head -5${NC}"
  echo -e " ${CYAN}# Extract and restore:${NC}"
  # NOTE(review): the two instruction lines below look like extraction dropped
  # an angle-bracket "<archive>" placeholder (bare ".tar.gz", double slash in
  # the /tmp path) — confirm against version control.
  echo -e " ${CYAN}tar xzf .tar.gz -C /tmp${NC}"
  echo -e " ${CYAN}gunzip -c /tmp//v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
  echo ""
  echo -e " Or use: ${CYAN}./scripts/upgrade.sh --rollback${NC}"
  echo ""
}

# --- Failure trap ---
# Fires on non-zero exit OR explicit SIGTERM/SIGINT. Writes a truthful
# failure result, discards the staged VERSION (Fix B), clears progress so
# the admin UI stops showing a frozen phase, and archives to history so
# the failure is retrievable.
on_failure() {
  local exit_code=$?
  # Clean up user path save directory if it exists
  if [[ -n "${USER_SAVE_DIR:-}" ]] && [[ -d "${USER_SAVE_DIR:-}" ]]; then
    rm -rf "$USER_SAVE_DIR"
  fi
  # Discard staged VERSION — the bump must only happen after full success.
  rm -f "${UPGRADE_DIR}/VERSION.pending" 2>/dev/null || true
  release_lock
  if [[ $exit_code -ne 0 ]] && [[ "$DRY_RUN" != "true" ]]; then
    local phase_tag="${CURRENT_PHASE_NAME:-unknown phase}"
    local fail_msg="Upgrade failed during ${phase_tag} at line ${BASH_LINENO[0]} (exit ${exit_code})"
    error "$fail_msg"
    # Always write the failure result — previously gated behind API_MODE,
    # which meant SIGTERM during a watcher-triggered upgrade left stale
    # success data in result.json.
    write_result_force "false" "$fail_msg"
    # Clear progress so the admin UI doesn't show a phantom in-progress phase.
    rm -f "$PROGRESS_FILE" 2>/dev/null || true
    # Append to history so the failure is discoverable later.
    archive_failure_to_history "$fail_msg"
    print_rollback_help
    info "Log file: $LOG_FILE"
  fi
}

# Same as write_result but bypasses the API_MODE guard. Used by on_failure
# to ensure a failure record always lands, even in non-API-mode runs.
write_result_force() {
  local success="$1" msg="$2"
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))
  mkdir -p "$UPGRADE_DIR"
  # NOTE(review): the JSON heredoc body here appears to have been lost in
  # extraction (fused redirection, missing success/message/fromCommit/toCommit
  # fields, dangling `)"`) — recover the original `cat <<REOF` body from
  # version control before editing.
  cat > "$RESULT_FILE" </dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
  "commitCount": ${COMMIT_COUNT:-0},
  "durationSeconds": ${duration_secs},
  "warnings": ${UPGRADE_WARNINGS:-[]},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
}

# Append a failure record to history.json (newest first, capped at 50 entries
# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts).
archive_failure_to_history() {
  _archive_to_history "false" "$1" "[]"
}

# Mirror for success path — prior code relied on the API's handlePostRestartResult
# to archive, which only fires for auto-upgrade post-restart.
Admin-UI-triggered # successes were leaking if the user dismissed the result card before the API # polled. API-side archiveResult dedupes on completedAt, so double-append is safe. archive_success_to_history() { _archive_to_history "true" "$1" "${UPGRADE_WARNINGS:-[]}" } _archive_to_history() { local success="$1" msg="$2" warnings_json="$3" local hist="${UPGRADE_DIR}/history.json" mkdir -p "$UPGRADE_DIR" local entry entry="$(cat </dev/null || echo "unknown")","commitCount":${COMMIT_COUNT:-0},"durationSeconds":$((SECONDS - ${START_TIME:-SECONDS})),"warnings":${warnings_json},"completedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)"} HEOF )" python3 - "$hist" "$entry" <<'PYEOF' 2>/dev/null || true import json, sys hist_path, entry_json = sys.argv[1], sys.argv[2] try: with open(hist_path) as f: history = json.load(f) if not isinstance(history, list): history = [] except Exception: history = [] history.insert(0, json.loads(entry_json)) history = history[:50] with open(hist_path, 'w') as f: json.dump(history, f, indent=2) PYEOF } # ============================================================================= # Parse Arguments # ============================================================================= show_help() { cat << 'EOF' Changemaker Lite V2 — Upgrade Script Usage: ./scripts/upgrade.sh [OPTIONS] Options: --skip-backup Skip backup phase (requires --force) --pull-services Also pull new third-party Docker images --use-registry Pull pre-built images from Gitea registry instead of rebuilding --dry-run Show what would happen without executing --force Continue past non-critical warnings --branch BRANCH Git branch to pull (default: current branch) --rollback Rollback to pre-upgrade commit --api-mode Write progress/result JSON for admin UI --help Show this help message Examples: ./scripts/upgrade.sh # Standard upgrade (build from source) ./scripts/upgrade.sh --use-registry # Fast upgrade using pre-built Gitea images ./scripts/upgrade.sh --dry-run # Preview changes 
./scripts/upgrade.sh --pull-services # Also update PostgreSQL, Redis, etc. ./scripts/upgrade.sh --rollback # Revert last upgrade EOF exit 0 } while [[ $# -gt 0 ]]; do case "$1" in --skip-backup) SKIP_BACKUP=true; shift ;; --pull-services) PULL_SERVICES=true; shift ;; --dry-run) DRY_RUN=true; shift ;; --force) FORCE=true; shift ;; --branch) BRANCH="$2"; shift 2 ;; --rollback) ROLLBACK=true; shift ;; --api-mode) API_MODE=true; shift ;; --use-registry) USE_REGISTRY=true; shift ;; --help|-h) show_help ;; *) error "Unknown option: $1"; echo "Run with --help for usage."; exit 1 ;; esac done # Validate flag combinations if [[ "$SKIP_BACKUP" == "true" ]] && [[ "$FORCE" != "true" ]]; then error "--skip-backup requires --force (backup protects your data)" exit 1 fi # ============================================================================= # Main # ============================================================================= START_TIME=$SECONDS cd "$PROJECT_DIR" # Setup logging mkdir -p "$LOG_DIR" exec > >(tee -a "$LOG_FILE") 2>&1 echo "" echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}" echo -e "${BOLD}${BLUE} Changemaker Lite V2 — Upgrade${NC}" echo -e "${BOLD}${BLUE} ${TIMESTAMP}${NC}" echo -e "${BOLD}${BLUE}══════════════════════════════════════════════════${NC}" if [[ "$DRY_RUN" == "true" ]]; then echo "" echo -e " ${YELLOW}DRY RUN — no changes will be made${NC}" fi trap on_failure EXIT # Explicit SIGTERM/SIGINT traps: bash runs EXIT on these too in theory, but # the marcelle v2.9.2 → v2.9.3 SIGTERM-kill showed no failure result was # written. Belt-and-suspenders — worst case it fires twice, and write_result # uses `>` so the second write is idempotent. 
trap on_failure TERM INT

acquire_lock
load_env

# Determine branch (source mode only — release installs have no git)
if [[ -z "$BRANCH" ]]; then
  if [[ "$INSTALL_MODE" == "release" ]]; then
    BRANCH="release"
  else
    BRANCH="$(git rev-parse --abbrev-ref HEAD)"
  fi
fi

# =============================================================================
# Rollback Mode
# =============================================================================
if [[ "$ROLLBACK" == "true" ]]; then
  phase "R" "Rollback"
  if [[ "$INSTALL_MODE" == "release" ]]; then
    # Release-mode rollback: re-extract the prior release tarball recorded
    # in VERSION.rollback (seeded at Phase 3 start of any upgrade).
    PRIOR_TAG="$(cat "${UPGRADE_DIR}/VERSION.rollback" 2>/dev/null | head -1 || true)"
    if [[ -z "$PRIOR_TAG" ]]; then
      error "No VERSION.rollback marker found at ${UPGRADE_DIR}/VERSION.rollback"
      # FIX: message previously read "Run: curl -fSL | tar xz ..." with the
      # URL operand missing (dropped placeholder) — restored a placeholder.
      error "Cannot determine prior release. Run: curl -fSL <tarball-url> | tar xz -C $PROJECT_DIR --strip-components=1"
      release_lock
      exit 1
    fi
    info "Rolling back to prior release: ${PRIOR_TAG}"
    TARBALL_URL="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/admin/changemaker.lite/releases/download/${PRIOR_TAG}/changemaker-lite-${PRIOR_TAG}.tar.gz"
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would download: $TARBALL_URL"
      info "[DRY RUN] Would extract to: $PROJECT_DIR (preserving .env)"
      info "[DRY RUN] Would run: docker compose pull api admin media-api nginx && docker compose up -d"
      release_lock
      exit 0
    fi
    ROLLBACK_DIR="$(mktemp -d)"
    if ! curl -fSL "$TARBALL_URL" -o "${ROLLBACK_DIR}/rb.tar.gz"; then
      error "Failed to download prior release tarball from ${TARBALL_URL}"
      rm -rf "$ROLLBACK_DIR"
      release_lock
      exit 1
    fi
    tar xzf "${ROLLBACK_DIR}/rb.tar.gz" -C "$ROLLBACK_DIR"
    # The tarball unpacks to a single top-level directory; sync its contents
    # over the project, never touching the operator's .env.
    ROLLBACK_SRC="$(find "$ROLLBACK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
    rsync -a --exclude='.env' "$ROLLBACK_SRC/" "$PROJECT_DIR/"
    rm -rf "$ROLLBACK_DIR"
    success "Code rolled back to ${PRIOR_TAG}"
    export IMAGE_TAG="latest"
    docker compose pull api admin media-api nginx || warn "Some images failed to pull — check registry reachability"
    docker compose up -d
    success "Containers restarted on ${PRIOR_TAG} images"
  else
    # Source-mode rollback: legacy git-based flow driven by the commit hash
    # embedded into the newest backup archive during Phase 2.
    LATEST_ARCHIVE="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
    if [[ -z "$LATEST_ARCHIVE" ]]; then
      error "No backup archives found in $BACKUP_DIR"
      error "Cannot determine pre-upgrade commit. Manual rollback needed."
      release_lock
      exit 1
    fi
    info "Latest backup: $(basename "$LATEST_ARCHIVE")"
    ARCHIVE_DIR="$(basename "$LATEST_ARCHIVE" .tar.gz)"
    # tar -O extracts the member to stdout without touching the filesystem.
    ROLLBACK_COMMIT="$(tar xzf "$LATEST_ARCHIVE" -O "${ARCHIVE_DIR}/git-commit.txt" 2>/dev/null || true)"
    if [[ -z "$ROLLBACK_COMMIT" ]]; then
      error "No git-commit.txt found in backup archive."
      # FIX: message previously ended "git checkout " with the operand missing
      # (dropped placeholder) — restored a placeholder.
      error "Manually specify: git checkout <commit>"
      release_lock
      exit 1
    fi
    info "Rolling back to commit: $ROLLBACK_COMMIT"
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would run: git checkout $ROLLBACK_COMMIT"
      info "[DRY RUN] Would rebuild: docker compose build $SOURCE_CONTAINERS"
      info "[DRY RUN] Would restart: docker compose up -d"
      release_lock
      exit 0
    fi
    git checkout -B "$BRANCH" "$ROLLBACK_COMMIT"
    # SOURCE_CONTAINERS is intentionally unquoted: it is a space-separated list.
    docker compose build $SOURCE_CONTAINERS
    docker compose up -d
    success "Rolled back to $ROLLBACK_COMMIT"
    echo ""
    echo -e " ${BOLD}Database restore:${NC}"
    echo -e " Code has been rolled back. Database was NOT rolled back."
    echo -e " The backup archive contains a PostgreSQL dump."
    echo -e " To restore (${RED}DESTRUCTIVE — replaces current data${NC}):"
    echo ""
    ARCHIVE_DIR_NAME="$(basename "$LATEST_ARCHIVE" .tar.gz)"
    echo -e " ${CYAN}tar xzf $LATEST_ARCHIVE -C /tmp${NC}"
    echo -e " ${CYAN}gunzip -c /tmp/$ARCHIVE_DIR_NAME/v2-postgres.sql.gz | docker exec -i changemaker-v2-postgres psql -U changemaker -d changemaker_v2${NC}"
    echo ""
  fi
  release_lock
  exit 0
fi

# =============================================================================
# Phase 1: Pre-flight Checks
# =============================================================================
phase "1" "Pre-flight Checks"
write_progress 1 "Pre-flight Checks" 5 "Verifying system requirements..."

# Docker binary
if command -v docker &>/dev/null; then
  success "Docker: $(docker --version | head -1)"
else
  error "Docker is not installed."
  exit 1
fi
# Compose v2 plugin
if docker compose version &>/dev/null; then
  success "Docker Compose: $(docker compose version --short)"
else
  error "Docker Compose v2 plugin not found."
  exit 1
fi
# Docker daemon running
# FIX: dropped the redundant trailing `2>&1` — `&>/dev/null` already
# redirects both stdout and stderr.
if docker info &>/dev/null; then
  success "Docker daemon running"
else
  error "Docker daemon not running."
  exit 1
fi
# Git
if command -v git &>/dev/null; then
  success "Git: $(git --version)"
else
  error "Git is not installed."
  exit 1
fi
# Remote reachable (source mode only — release mode pulls from Gitea API later)
if [[ "$INSTALL_MODE" == "source" ]]; then
  info "Checking git remote..."
  # FIX: same redundant `2>&1` after `&>/dev/null` removed here.
  if timeout 10 git ls-remote origin HEAD &>/dev/null; then
    success "Git remote reachable"
  else
    error "Cannot reach git remote. Check your network or remote configuration."
    exit 1
  fi
fi
# Working directory checks
if [[ ! -f "$PROJECT_DIR/docker-compose.yml" ]]; then
  error "docker-compose.yml not found. Are you in the project root?"
  exit 1
fi
if [[ ! -f "$PROJECT_DIR/.env" ]]; then
  error ".env not found. Run ./config.sh first."
  exit 1
fi
success "Project files verified"

# Disk space
AVAILABLE_MB=$(df -m "$PROJECT_DIR" | awk 'NR==2 {print $4}')
if [[ "$AVAILABLE_MB" -lt "$MIN_DISK_MB" ]]; then
  error "Insufficient disk space: ${AVAILABLE_MB}MB available, ${MIN_DISK_MB}MB required."
  exit 1
fi
success "Disk space: ${AVAILABLE_MB}MB available"

# Record pre-upgrade state
if [[ "$INSTALL_MODE" == "source" ]]; then
  PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)"
  PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)"
  info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))"
else
  # Release mode: derive tag/commit from the VERSION file — line 1 is the
  # release tag, line 2 is the commit hash.
  PRE_UPGRADE_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")"
  PRE_UPGRADE_SHORT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
  info "Current version: $PRE_UPGRADE_SHORT"
fi
info "Target branch: $BRANCH"

# Record running containers (for restoring monitoring profile later)
MONITORING_WAS_RUNNING=false
if docker ps --format '{{.Names}}' | grep -q 'prometheus-changemaker'; then
  MONITORING_WAS_RUNNING=true
  info "Monitoring stack detected (will restart after upgrade)"
fi

# Source-mode-only checks: dirty files + upstream commit comparison
if [[ "$INSTALL_MODE" == "source" ]]; then
  # Warn about uncommitted changes in project-owned paths
  # (unquoted on purpose: space-separated pathspec list)
  PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml"
  DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)"
  if [[ -n "$DIRTY_PROJECT_FILES" ]]; then
    warn "Uncommitted changes in project-owned files:"
    echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done
    if [[ "$FORCE" != "true" ]]; then
      error "Commit or stash these changes first, or use --force to continue."
      exit 1
    fi
    warn "Continuing with --force (changes will be stashed)"
  fi
  # Check for available updates
  LOCAL_HEAD="$(git rev-parse HEAD)"
  REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)"
  if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then
    info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes."
    if [[ "$FORCE" != "true" ]]; then
      success "Nothing to upgrade."
      release_lock
      exit 0
    fi
    warn "Continuing with --force despite no upstream changes."
  fi
fi
# Release mode: the upstream-version comparison happens later in the
# release-mode block (line ~597) which queries the Gitea Releases API.

# =============================================================================
# Phase 2: Backup
# =============================================================================
phase "2" "Backup"
write_progress 2 "Backup" 15 "Creating backup..."

if [[ "$SKIP_BACKUP" == "true" ]]; then
  warn "Backup skipped (--skip-backup --force)"
else
  # Run existing backup script
  if [[ -x "$SCRIPT_DIR/backup.sh" ]]; then
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would run: scripts/backup.sh"
    else
      info "Running database backup..."
      "$SCRIPT_DIR/backup.sh"
      success "Database backup complete"
    fi
  else
    warn "scripts/backup.sh not found or not executable, skipping database backup"
  fi

  # Archive user-modifiable content (best-effort: tar errors are tolerated)
  USER_BACKUP="${BACKUP_DIR}/upgrade-user-content-${TIMESTAMP}.tar.gz"
  USER_BACKUP_FILES=()
  for p in "${USER_PATHS[@]}"; do
    if [[ -e "$PROJECT_DIR/$p" ]]; then
      USER_BACKUP_FILES+=("$p")
    fi
  done
  if [[ ${#USER_BACKUP_FILES[@]} -gt 0 ]]; then
    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would archive user content: ${USER_BACKUP_FILES[*]}"
    else
      mkdir -p "$BACKUP_DIR"
      tar -czf "$USER_BACKUP" -C "$PROJECT_DIR" "${USER_BACKUP_FILES[@]}" 2>/dev/null || true
      success "User content archived: $(du -h "$USER_BACKUP" | cut -f1)"
    fi
  fi

  # Save pre-upgrade commit hash for rollback reference
  LATEST_BACKUP="$(ls -t "$BACKUP_DIR"/changemaker-v2-backup-*.tar.gz 2>/dev/null | head -1 || true)"
  if [[ -n "$LATEST_BACKUP" ]] && [[ "$DRY_RUN" != "true" ]]; then
    # Append git-commit.txt into the latest backup archive by
    # extract → add file → recompress.
    COMMIT_TMPDIR="$(mktemp -d)"
    ARCHIVE_BASENAME="$(basename "$LATEST_BACKUP" .tar.gz)"
    mkdir -p "$COMMIT_TMPDIR/$ARCHIVE_BASENAME"
    echo "$PRE_UPGRADE_COMMIT" > "$COMMIT_TMPDIR/$ARCHIVE_BASENAME/git-commit.txt"
    # FIX: gate the recompress on a *successful* extraction. Previously the
    # extraction was `|| true`d and the re-tar ran unconditionally, so a
    # corrupt/unreadable archive was REPLACED by one containing only
    # git-commit.txt — silently destroying the backup it was annotating.
    if tar xzf "$LATEST_BACKUP" -C "$COMMIT_TMPDIR" 2>/dev/null; then
      tar czf "$LATEST_BACKUP" -C "$COMMIT_TMPDIR" "$ARCHIVE_BASENAME"
      success "Saved commit reference ($PRE_UPGRADE_SHORT) in backup archive"
    else
      warn "Could not unpack $(basename "$LATEST_BACKUP") — commit reference not embedded; archive left untouched"
    fi
    rm -rf "$COMMIT_TMPDIR"
  fi
fi

# =============================================================================
# Phase 3: Code Update
# =============================================================================
phase "3" "Code Update"
write_progress 3 "Code Update" 30 "Pulling latest code..."
# --- Release mode: download tarball instead of git pull ---
if [[ "$INSTALL_MODE" == "release" ]]; then
  GITEA_API="${GITEA_REGISTRY_URL:-https://gitea.bnkops.com}/api/v1"
  CURRENT_VERSION=$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")
  info "Release mode — checking for updates (current: ${CURRENT_VERSION})..."
  RELEASE_JSON=$(curl -sf "${GITEA_API}/repos/admin/changemaker.lite/releases/latest" 2>/dev/null || true)
  if [[ -z "$RELEASE_JSON" ]]; then
    error "Could not reach Gitea API. Check network or GITEA_REGISTRY_URL."
    exit 1
  fi
  LATEST_TAG=$(echo "$RELEASE_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
  # First .tar.gz asset in the release is taken as the distribution tarball.
  TARBALL_URL=$(echo "$RELEASE_JSON" | python3 -c "
import sys, json
for a in json.load(sys.stdin).get('assets', []):
    if a['name'].endswith('.tar.gz'): print(a['browser_download_url']); break
" 2>/dev/null || true)
  if [[ "$CURRENT_VERSION" == "$LATEST_TAG" ]] && [[ "$FORCE" != "true" ]]; then
    info "Already at latest version: ${CURRENT_VERSION}"
    write_progress 3 "Code Update" 45 "Already up to date"
  elif [[ -z "$TARBALL_URL" ]]; then
    error "No tarball found in release ${LATEST_TAG}"
    exit 1
  else
    info "Updating ${CURRENT_VERSION} → ${LATEST_TAG}..."
    write_progress 3 "Code Update" 35 "Downloading ${LATEST_TAG}..."
    # Download
    DOWNLOAD_DIR=$(mktemp -d)
    curl -fSL "$TARBALL_URL" -o "${DOWNLOAD_DIR}/update.tar.gz"
    tar xzf "${DOWNLOAD_DIR}/update.tar.gz" -C "$DOWNLOAD_DIR"
    # Tarball unpacks to a single top-level directory.
    UPDATE_SRC=$(find "$DOWNLOAD_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)
    # Save user paths
    save_user_paths
    # Sync new files, preserving .env. VERSION is staged to a pending
    # location and only promoted after Phase 7 verification succeeds (Fix B),
    # so interrupted upgrades don't leave a misleading "upgraded" marker.
    # Also stash the CURRENT VERSION as VERSION.rollback so --rollback and
    # print_rollback_help know what release to restore on failure.
    write_progress 3 "Code Update" 40 "Applying update..."
    mkdir -p "$UPGRADE_DIR"
    if [[ -f "$PROJECT_DIR/VERSION" ]]; then
      cp "$PROJECT_DIR/VERSION" "$UPGRADE_DIR/VERSION.rollback"
    fi
    rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/"
    cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending"
    # Restore user paths
    restore_user_paths
    # Restore tracked files that may have been overwritten
    DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
    if [[ -n "$DELETED_TRACKED" ]]; then
      echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
    fi
    rm -rf "$DOWNLOAD_DIR"
    success "Updated to ${LATEST_TAG}"
  fi
  # Skip the git-based update flow below
  POST_PULL_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" | tail -1 2>/dev/null || echo "release")"
elif [[ "$DRY_RUN" == "true" ]]; then
  info "[DRY RUN] Would fetch and show incoming changes:"
  git fetch origin "$BRANCH" 2>/dev/null || true
  INCOMING="$(git log --oneline HEAD..origin/"$BRANCH" 2>/dev/null || echo "(unable to preview)")"
  if [[ -n "$INCOMING" ]]; then
    echo "$INCOMING"
  else
    info "No new commits to pull."
  fi
  info "[DRY RUN] Would preserve user-modifiable paths: ${USER_PATHS[*]}"
  info "[DRY RUN] Would stash local changes, pull, and pop stash"
  release_lock
  exit 0
fi

# Source-mode git pull flow. Release mode handles its update via tarball
# download in the block above and skips this entire section.
if [[ "$INSTALL_MODE" == "source" ]]; then
  # Step 0: Save user-modifiable paths before any git operations
  save_user_paths
  # Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files)
  # `git ls-files -v` marks skip-worktree entries with a leading "S".
  SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)"
  if [[ -n "$SKIP_WORKTREE_FILES" ]]; then
    info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..."
    echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree
    success "Skip-worktree flags cleared"
  fi
  # Step 0c: Fix Docker-owned directories that block git checkout
  for owned_dir in api/upgrade api/uploads api/configs; do
    if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then
      info "Fixing permissions on $owned_dir..."
      docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true
    fi
  done
  # Step 1: Stash user changes if any exist
  HAS_CHANGES=false
  if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
    HAS_CHANGES=true
    STASH_NAME="upgrade-${TIMESTAMP}"
    info "Stashing local changes as '$STASH_NAME'..."
    git stash push --include-untracked -m "$STASH_NAME"
    success "Local changes stashed"
  fi
  # Step 3: Pull updates
  info "Pulling updates from origin/$BRANCH..."
  if ! git pull origin "$BRANCH" --no-edit 2>&1; then
    error "git pull failed. This may indicate upstream force-push or branch issues."
    if [[ "$HAS_CHANGES" == "true" ]]; then
      warn "Your stashed changes can be recovered with: git stash pop"
    fi
    exit 1
  fi
  POST_PULL_COMMIT="$(git rev-parse --short HEAD)"
  success "Updated to $POST_PULL_COMMIT"
  # Step 4: Pop stash and handle conflicts
  if [[ "$HAS_CHANGES" == "true" ]]; then
    info "Restoring local changes..."
    if git stash pop 2>&1; then
      success "Local changes restored cleanly"
    else
      warn "Merge conflicts detected during stash pop"
      # Auto-resolve user-modifiable paths by keeping user's version
      RESOLVED_COUNT=0
      for user_path in "${USER_PATHS[@]}"; do
        CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)"
        if [[ -n "$CONFLICTED" ]]; then
          while IFS= read -r cf; do
            info " Auto-resolving (keeping yours): $cf"
            # In a stash-pop merge, "--theirs" is the stashed (user's) side,
            # so this keeps the user's edits as the comment above says.
            git checkout --theirs "$cf" 2>/dev/null || true
            git add "$cf"
            RESOLVED_COUNT=$((RESOLVED_COUNT + 1))
          done < <(echo "$CONFLICTED")
        fi
      done
      # Check if any conflicts remain in project-owned files
      REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)"
      if [[ -n "$REMAINING_CONFLICTS" ]]; then
        error "Unresolved conflicts in project-owned files:"
        echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done
        echo ""
        error "These files have upstream changes that conflict with your edits."
        error "Resolve manually, then run the upgrade again."
        info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT"
        info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT"
        exit 1
      fi
      if [[ $RESOLVED_COUNT -gt 0 ]]; then
        success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)"
      fi
    fi
  fi
  # Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies)
  restore_user_paths
  # Step 4c: Restore any tracked files accidentally deleted by restore_user_paths
  # (can happen when save_user_paths can't read root-owned files in user paths)
  DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
  if [[ -n "$DELETED_TRACKED" ]]; then
    info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..."
    echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
    success "Tracked files restored from HEAD"
  fi
fi # End of source-mode git pull flow

# Step 5: Detect new env vars
info "Checking for new environment variables..."
# Step 5 (cont.): merge env vars that exist in .env.example but not yet in the
# user's .env, so newly introduced settings get their defaults automatically.
if [[ -f "$PROJECT_DIR/.env.example" ]] && [[ -f "$PROJECT_DIR/.env" ]]; then
  NEW_VARS=()
  while IFS='=' read -r key value; do
    # Skip blank lines and comment lines.
    [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
    key="$(echo "$key" | xargs)"
    # Only accept well-formed identifiers (also keeps the grep pattern below safe).
    [[ ! "$key" =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]] && continue
    if ! grep -q "^${key}=" "$PROJECT_DIR/.env" 2>/dev/null; then
      # Strip inline comments and trim whitespace before appending
      # NOTE(review): this also truncates a literal '#' inside a quoted default
      # value — assumes .env.example defaults never contain '#'; confirm.
      value="${value%%#*}"
      value="$(echo "$value" | xargs)"
      echo "${key}=${value}" >> "$PROJECT_DIR/.env"
      NEW_VARS+=("$key")
    fi
  done < "$PROJECT_DIR/.env.example"
  if [[ ${#NEW_VARS[@]} -gt 0 ]]; then
    warn "New env vars added to .env (review defaults):"
    for v in "${NEW_VARS[@]}"; do
      echo -e " ${CYAN}$v${NC}"
    done
  else
    success "No new environment variables"
  fi
fi

# Step 6: Print update summary (source mode only — release mode has no commit range)
COMMIT_COUNT=0
if [[ "$INSTALL_MODE" == "source" ]]; then
  COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}"
  # Use || true and check pipefail-safe to survive git failures
  COMMIT_COUNT="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | wc -l | xargs || echo 0)"
  echo ""
  info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)"
  git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 || true
  if [[ "$COMMIT_COUNT" -gt 20 ]]; then
    info " ... and $((COMMIT_COUNT - 20)) more"
  fi
  # Flag commits that may require manual attention (grep patterns are OR'd).
  BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)"
  if [[ -n "$BREAKING_COMMITS" ]]; then
    echo ""
    warn "Commits requiring manual attention:"
    echo "$BREAKING_COMMITS" | while read -r line; do
      echo -e " ${YELLOW}$line${NC}"
    done
  fi
else
  info "Update summary: ${PRE_UPGRADE_SHORT} → release"
fi

# =============================================================================
# Phase 4: Container Rebuild
# =============================================================================
phase "4" "Container Rebuild"
write_progress 4 "Container Rebuild" 50 "Preparing containers..."
# Diff of what the pull changed — drives conditional rebuild decisions below.
CHANGED_FILES="$(git diff --name-only "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null || true)"
if [[ "$USE_REGISTRY" == "true" ]]; then
  # --- Registry pull path: pull pre-built production images from Gitea ---
  REGISTRY="${GITEA_REGISTRY:-gitea.bnkops.com/admin}"
  REGISTRY_TAG="$(git rev-parse --short HEAD 2>/dev/null || echo "latest")"
  # Exported so docker compose interpolates image references from them.
  export GITEA_REGISTRY="$REGISTRY"
  export IMAGE_TAG="$REGISTRY_TAG"
  export BUILD_TARGET=production
  info "Registry mode: ${REGISTRY} (tag: ${REGISTRY_TAG})"
  write_progress 4 "Container Rebuild" 55 "Pulling images from registry..."
  # Pull core app containers: try SHA tag → :latest fallback → source build
  # NOTE: stderr intentionally flows through so slow/broken pulls are visible
  # in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3
  # systemd-killed upgrade with zero error trace.
  # NOTE(review): PULLED_TAG is recorded but not read in this chunk —
  # presumably consumed later (e.g. result.json); verify it is still used.
  PULLED_TAG=""
  if docker compose pull api admin media-api; then
    success "Core images pulled from registry (tag: ${REGISTRY_TAG})"
    PULLED_TAG="$REGISTRY_TAG"
  elif [[ "$REGISTRY_TAG" != "latest" ]]; then
    warn "Tag :${REGISTRY_TAG} not in registry — trying :latest"
    export IMAGE_TAG="latest"
    if docker compose pull api admin media-api; then
      success "Core images pulled from registry (tag: latest)"
      PULLED_TAG="latest"
      # Retag :latest as :SHA so compose up uses consistent tags
      for svc in api admin media-api; do
        local_img="${REGISTRY}/changemaker-${svc}"
        docker tag "${local_img}:latest" "${local_img}:${REGISTRY_TAG}" 2>/dev/null || true
      done
      export IMAGE_TAG="$REGISTRY_TAG"
    else
      warn "Registry pull failed for :latest too — falling back to source build"
      # Restore the SHA tag before building so locally built images land
      # under the tag compose will start.
      export IMAGE_TAG="$REGISTRY_TAG"
      docker compose build $SOURCE_CONTAINERS
      success "Source containers rebuilt (registry fallback)"
    fi
  else
    warn "Registry pull failed — falling back to source build"
    docker compose build $SOURCE_CONTAINERS
    success "Source containers rebuilt (registry fallback)"
  fi
  # nginx: try SHA → :latest → rebuild if config changed
  NGINX_PULLED=false
  if docker compose pull nginx; then
    success "nginx pulled from registry (tag: ${IMAGE_TAG})"
    NGINX_PULLED=true
  elif [[ "$REGISTRY_TAG" != "latest" ]]; then
    export IMAGE_TAG="latest"
    if docker compose pull nginx; then
      docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true
      success "nginx pulled from registry (tag: latest)"
      NGINX_PULLED=true
    fi
    # Always restore the SHA tag, whether or not the :latest pull worked.
    export IMAGE_TAG="$REGISTRY_TAG"
  fi
  if [[ "$NGINX_PULLED" == "false" ]]; then
    if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
      info "Rebuilding nginx (config changed, not in registry)..."
      docker compose build nginx
      success "nginx rebuilt"
    else
      info "nginx unchanged, skipping rebuild"
    fi
  fi
else
  # --- Source build path (original behaviour) ---
  # $SOURCE_CONTAINERS is intentionally unquoted: it is a space-separated
  # list that must word-split into individual service arguments.
  info "Rebuilding source containers: $SOURCE_CONTAINERS"
  docker compose build $SOURCE_CONTAINERS
  success "Source containers rebuilt"
  # Conditionally rebuild containers whose Dockerfiles changed
  for svc in $CONDITIONAL_CONTAINERS; do
    case "$svc" in
      nginx)
        if echo "$CHANGED_FILES" | grep -q "^nginx/"; then
          info "Rebuilding nginx (config changed)..."
          docker compose build nginx
          success "nginx rebuilt"
        else
          info "nginx unchanged, skipping rebuild"
        fi
        ;;
    esac
  done
fi
# Optionally pull third-party images
if [[ "$PULL_SERVICES" == "true" ]]; then
  info "Pulling latest third-party images..."
  docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true
  success "Third-party images updated"
  # Record image digests for audit trail
  info "Recording image digests for audit trail..."
  docker compose images --format json 2>/dev/null | \
    python3 -c "import sys,json; [print(f' {i[\"Repository\"]}:{i[\"Tag\"]} -> {i[\"ID\"][:12]}') for i in json.load(sys.stdin)]" \
    2>/dev/null || true
fi

# =============================================================================
# Phase 5: Database Migration
# =============================================================================
phase "5" "Database Migration"
write_progress 5 "Database Migration" 55 "Checking database state..."
# Ensure infrastructure is running and healthy
info "Ensuring infrastructure is up..."
docker compose up -d $INFRA_CONTAINERS
# Wait for PostgreSQL to be ready
info "Waiting for PostgreSQL..."
PG_WAIT=0
PG_TIMEOUT=60
# NOTE(review): `2>&1` after `&>/dev/null` is redundant (the `&>` already
# redirects both streams) — harmless, left as-is.
while ! docker compose exec -T v2-postgres pg_isready -U "${V2_POSTGRES_USER:-changemaker}" &>/dev/null 2>&1; do
  sleep 2
  PG_WAIT=$((PG_WAIT + 2))
  if [[ $PG_WAIT -ge $PG_TIMEOUT ]]; then
    error "PostgreSQL did not become ready within ${PG_TIMEOUT}s"
    exit 1
  fi
done
success "PostgreSQL ready (${PG_WAIT}s)"
# Check for failed/incomplete migrations: rolled back, or started >10 min ago
# and never finished (a previous upgrade was likely killed mid-migration).
info "Checking for failed migrations..."
FAILED_MIGRATIONS="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c " SELECT migration_name FROM _prisma_migrations WHERE rolled_back_at IS NOT NULL OR (finished_at IS NULL AND started_at IS NOT NULL AND started_at < NOW() - INTERVAL '10 minutes') " 2>/dev/null || true)"
if [[ -n "$FAILED_MIGRATIONS" ]]; then
  warn "Found failed/incomplete migrations — auto-resolving..."
  while IFS= read -r migration_name; do
    [[ -z "$migration_name" ]] && continue
    info " Resolving: $migration_name"
    # `--entrypoint ""` bypasses the image's startup script so only the
    # prisma CLI runs; `--no-deps` avoids starting dependent services.
    docker compose run --rm --no-deps --entrypoint "" api \
      npx prisma migrate resolve --applied "$migration_name" 2>&1 || {
      warn " Could not auto-resolve $migration_name (may need manual intervention)"
    }
  done <<< "$FAILED_MIGRATIONS"
  success "Failed migrations resolved"
else
  success "No failed migrations found"
fi
# Preview pending migrations before applying
info "Checking pending migrations..."
PENDING_OUTPUT="$(docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma migrate status 2>&1 || true)"
if echo "$PENDING_OUTPUT" | grep -q "Following migration"; then
  info "Pending migrations to apply:"
  echo "$PENDING_OUTPUT" | grep -E "^\s+[0-9]" | while read -r line; do
    echo " $line"
  done
fi
# Run migrations in a one-off container (catches errors here, not in a restart loop)
info "Running database migrations..."
write_progress 5 "Database Migration" 60 "Applying migrations..."
if ! docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma migrate deploy 2>&1; then
  error "Database migration failed!"
  error ""
  error "Common fixes:"
  error " 1. Check migration status:"
  error " docker compose exec v2-postgres psql -U changemaker -d changemaker_v2 \\"
  error " -c \"SELECT migration_name, finished_at, rolled_back_at FROM _prisma_migrations ORDER BY started_at DESC LIMIT 10;\""
  error " 2. Mark a stuck migration as applied:"
  # NOTE(review): the hint below ends after "--applied " — it looks like a
  # "<migration_name>" placeholder was lost; confirm and restore.
  error " docker compose run --rm --no-deps --entrypoint '' api npx prisma migrate resolve --applied "
  error " 3. Check logs: docker compose logs api --tail 50"
  error ""
  error "After fixing, re-run: ./scripts/upgrade.sh --force --skip-backup"
  exit 1
fi
# Count applied migrations
MIGRATION_COUNT="$(docker compose exec -T v2-postgres psql -U "${V2_POSTGRES_USER:-changemaker}" -d "${V2_POSTGRES_DB:-changemaker_v2}" -t -A -c " SELECT COUNT(*) FROM _prisma_migrations WHERE finished_at IS NOT NULL " 2>/dev/null || echo "?")"
success "Migrations up to date ($MIGRATION_COUNT total applied)"
# Run database seed (idempotent)
info "Running database seed..."
write_progress 5 "Database Migration" 65 "Seeding database..."
if ! docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma db seed 2>&1; then
  warn "Database seed had warnings (non-fatal, continuing)"
fi
success "Database seed complete"
# Verify migration state is clean (no drift)
info "Verifying migration state..."
MIGRATE_STATUS="$(docker compose run --rm --no-deps --entrypoint "" api \
  npx prisma migrate status 2>&1 || true)"
if echo "$MIGRATE_STATUS" | grep -qiE "failed|drift|out of sync"; then
  error "Schema drift detected after migration!"
  echo "$MIGRATE_STATUS"
  exit 1
fi
success "Schema state verified — no drift"

# =============================================================================
# Phase 6: Service Restart
# =============================================================================
phase "6" "Service Restart"
write_progress 6 "Service Restart" 70 "Restarting services..."
# Graceful shutdown with extended drain period (allow in-flight requests to complete)
STOP_TIMEOUT=30
info "Stopping application containers (${STOP_TIMEOUT}s grace period)..."
# $APP_CONTAINERS is intentionally unquoted — space-separated service list.
docker compose stop -t $STOP_TIMEOUT $APP_CONTAINERS 2>/dev/null || true
success "Application containers stopped"
# Force-recreate LSIO containers to prevent anonymous volume shadowing bind mounts.
# LSIO images define a VOLUME at /config in their Dockerfile. When a container is
# merely restarted, Docker reuses the old anonymous volume whose /config/www is empty,
# which shadows the bind mount (e.g., ./mkdocs/site:/config/www → 403 Forbidden).
# Removing the container first ensures a fresh anonymous volume that respects bind mounts.
info "Removing LSIO containers (clearing anonymous volumes)..."
docker compose rm -sf $LSIO_VOLUME_CONTAINERS 2>/dev/null || true
success "LSIO containers cleared for fresh recreation"
# Verify Gancio config.json exists before starting services
verify_gancio_config
# Detect if npm dependencies changed (stale anonymous volumes cause missing modules)
NEEDS_VOLUME_REFRESH=false
if echo "$CHANGED_FILES" | grep -qE "^(api|admin)/(package\.json|package-lock\.json)"; then
  NEEDS_VOLUME_REFRESH=true
  warn "Package dependencies changed — will recreate containers with fresh volumes"
fi
# Start API (migrations already applied in Phase 5)
info "Starting API..."
if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
  info "Removing old API/admin containers (clearing stale node_modules volumes)..."
  # -v also removes the containers' anonymous volumes (the stale node_modules).
  docker compose rm -sfv api admin 2>/dev/null || true
fi
docker compose up -d api
# Poll API health check
info "Waiting for API health check..."
API_WAIT=0
while true; do
  # wget --spider: HEAD-style probe inside the container, no body download.
  if docker compose exec -T api wget -q --spider http://localhost:4000/api/health 2>/dev/null; then
    break
  fi
  # Detect container crash early (don't wait full timeout)
  if ! docker compose ps api --format '{{.State}}' 2>/dev/null | grep -q "running"; then
    error "API container exited unexpectedly"
    docker compose logs api --tail 20
    exit 1
  fi
  sleep $HEALTH_INTERVAL
  API_WAIT=$((API_WAIT + HEALTH_INTERVAL))
  if [[ $API_WAIT -ge $HEALTH_TIMEOUT ]]; then
    error "API did not become healthy within ${HEALTH_TIMEOUT}s"
    error "Check logs: docker compose logs api --tail 50"
    exit 1
  fi
done
success "API healthy (${API_WAIT}s)"
# Start everything else (exclude one-shot init containers)
info "Starting remaining services..."
docker compose up -d \
  --scale listmonk-init=0 \
  --scale gancio-init=0 \
  --scale vaultwarden-init=0
success "All services started"
# Restart Pangolin tunnel connector if running (may hold stale state after nginx rebuild)
# NOTE(review): `grep -q 'newt'` is a substring match — any container whose
# name merely contains "newt" would trigger this; confirm the name is unique
# or anchor the pattern.
if docker ps --format '{{.Names}}' | grep -q 'newt'; then
  info "Restarting Pangolin tunnel connector..."
  docker compose restart newt 2>/dev/null || true
  success "Newt tunnel restarted"
fi
# Restart monitoring if it was running before
if [[ "$MONITORING_WAS_RUNNING" == "true" ]]; then
  info "Restarting monitoring stack..."
  if docker compose --profile monitoring up -d 2>&1; then
    success "Monitoring stack restarted"
  else
    warn "Monitoring stack restart had errors (non-fatal, services may already be running)"
  fi
fi

# =============================================================================
# Phase 7: Post-Upgrade Verification
# =============================================================================
phase "7" "Post-Upgrade Verification"
write_progress 7 "Verification" 90 "Running health checks..."
VERIFY_FAILED=false # Polling health check helper (retries for up to MAX_WAIT seconds) verify_service_health() { local name="$1" check_cmd="$2" max_wait="${3:-30}" local waited=0 while [[ $waited -lt $max_wait ]]; do if eval "$check_cmd" 2>/dev/null; then success "$name: healthy (${waited}s)" return 0 fi sleep 3 waited=$((waited + 3)) done warn "$name: not responding after ${max_wait}s" VERIFY_FAILED=true # Always return 0 — under set -e a non-zero return from this helper would # exit the script before write_result runs. The VERIFY_FAILED flag is the # signal the caller actually checks. return 0 } # API health (with polling — may still be running migrations) verify_service_health "API (port 4000)" \ "docker compose exec -T api wget -q --spider http://localhost:4000/api/health" 45 # Admin health — 90s matches the admin container's start_period + a cushion # for first-boot Vite bundling. 30s was aspirational and produced cry-wolf # warnings on every successful upgrade. verify_service_health "Admin (port 3000)" \ "docker compose exec -T admin wget -q --spider http://localhost:3000/" 90 # Media API health (optional — may not be enabled) if docker ps --format '{{.Names}}' | grep -q 'changemaker-media-api'; then verify_service_health "Media API (port 4100)" \ "docker compose exec -T media-api wget -q --spider http://127.0.0.1:4100/health" 30 fi # Gancio health (optional) — restart loop is still a hard signal, but # "starting" now gets retry grace instead of passing silently. 
# Gancio: a restart loop is a definite failure; anything else gets the normal
# polling grace period via verify_service_health.
if docker ps --format '{{.Names}}' | grep -q 'gancio-changemaker'; then
  if docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -qi "restarting"; then
    warn "Gancio: restart loop detected (check config.json in gancio-data volume)"
    VERIFY_FAILED=true
  else
    verify_service_health "Gancio" \
      "docker compose ps gancio --format '{{.Status}}' 2>/dev/null | grep -q healthy" 60
  fi
fi
# MkDocs static site health (retry — first-boot rebuild can lag)
if docker ps --format '{{.Names}}' | grep -q 'mkdocs-site-server'; then
  verify_service_health "MkDocs site (port ${MKDOCS_SITE_SERVER_PORT:-4004})" \
    "curl -sf http://localhost:${MKDOCS_SITE_SERVER_PORT:-4004}/ -o /dev/null" 30
fi
# Check for containers in restart loop
RESTARTING="$(docker compose ps 2>/dev/null | grep -i "restarting" || true)"
if [[ -n "$RESTARTING" ]]; then
  warn "Containers in restart loop:"
  echo "$RESTARTING"
  VERIFY_FAILED=true
fi
if [[ "$VERIFY_FAILED" == "true" ]]; then
  warn "Some health checks failed. Services may still be starting."
  info "Check logs: docker compose logs --tail 50"
  # Seed the warnings array; the external probe below may append to it.
  UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
else
  success "All health checks passed"
fi

# --- External reachability probe (Fix C) ---
# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be
# caught by the localhost-only checks above. Warn (don't fail) because
# transient tunnel issues should not roll back a successful upgrade.
if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then
  info "Probing external API at https://api.${DOMAIN}/api/health ..."
  # "000" sentinel covers curl-level failures (DNS, TLS, timeout).
  EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
    "https://api.${DOMAIN}/api/health" 2>/dev/null || echo "000")"
  if [[ "$EXT_CODE" == "200" ]]; then
    success "External API reachable (HTTP 200)"
  else
    warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel"
    # Append to the JSON warnings array via python3; on any python failure the
    # `|| echo` fallback preserves the existing array unchanged.
    UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS"
import json, sys
try:
    w = json.loads(sys.argv[1]) if sys.argv[1] else []
except Exception:
    w = []
if not isinstance(w, list):
    w = []
w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}")
print(json.dumps(w))
PYEOF
)"
  fi
fi

# --- Atomic VERSION promotion (Fix B) ---
# The staged VERSION from Phase 3 lands now that we've reached the end of
# Phase 7 without on_failure firing. Promote regardless of VERIFY_FAILED —
# that flag is a soft health-check warning (e.g. "admin slow to respond"),
# not an upgrade failure. The tarball is extracted, containers are up, and
# write_result below will record success=true. Gating promotion on
# VERIFY_FAILED previously caused a "stuck at old VERSION" bug where a
# transient admin healthcheck warning pinned the install back.
# Hard failures (SIGTERM, exit !=0) still prevent promotion via on_failure,
# which rm -f's VERSION.pending before it can be promoted.
if [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then
  mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION"
  success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")"
fi

# =============================================================================
# Summary
# =============================================================================
ELAPSED="$(elapsed)"
# Source installs report the git HEAD; release installs report the VERSION file.
if [[ "$INSTALL_MODE" == "source" ]]; then
  FINAL_COMMIT="$(git rev-parse --short HEAD)"
else
  FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
fi
write_progress 7 "Verification" 100 "Upgrade complete!"
write_result "true" "Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS" archive_success_to_history "Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)" echo "" echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}" echo -e "${BOLD}${GREEN} Upgrade Complete${NC}" echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}" echo "" echo -e " ${BOLD}Previous:${NC} $PRE_UPGRADE_SHORT" if [[ "$INSTALL_MODE" == "source" ]]; then echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD 2>/dev/null || echo "$FINAL_COMMIT"))" else echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT" fi echo -e " ${BOLD}Commits:${NC} $COMMIT_COUNT" echo -e " ${BOLD}Duration:${NC} $ELAPSED" echo -e " ${BOLD}Log:${NC} $LOG_FILE" echo "" release_lock trap - EXIT