From 47704667b14aa99e68fd4ef95ebdb7ed112594c3 Mon Sep 17 00:00:00 2001 From: bunker-admin Date: Wed, 15 Apr 2026 16:13:04 -0600 Subject: [PATCH] Upgrade failure visibility + atomic VERSION + external smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes to harden the admin-UI upgrade path, all in scripts/upgrade.sh. Root-caused by yesterday's v2.9.2 → v2.9.3 on marcelle which was killed by systemd mid-Phase-4 and left the system in a misleading half-upgraded state (VERSION bumped, container pre-upgrade, result.json stale from 24h prior). - Fix A (failure visibility): stop silencing stderr on the five docker compose pull sites so timeouts / auth failures / network errors flow into upgrade-watcher.log. Add explicit SIGTERM/SIGINT traps alongside the existing EXIT trap. Track CURRENT_PHASE_NAME globally so the failure message reports "during Phase 4: Container Rebuild" rather than just an exit code. Introduce write_result_force (bypasses API_MODE guard) + archive_failure_to_history so a killed upgrade always leaves a truthful result.json + history.json entry, and the progress.json is cleared so the admin UI stops showing a phantom in-progress phase. - Fix B (atomic VERSION): Phase 3 rsync now --excludes VERSION and stashes the new one at data/upgrade/VERSION.pending. Phase 7 promotes it to VERSION only after VERIFY_FAILED stays false. on_failure deletes the pending file. upgrade-check.sh needs no changes — its head -1 VERSION read sees actual state instead of a mid-upgrade promise. - Fix C (external smoke): after Phase 7 localhost checks, curl https://api.${DOMAIN}/api/health with --max-time 10 and warn (not fail) on non-200. Catches Pangolin resource misassignments that the localhost-only checks miss. Appends to UPGRADE_WARNINGS so the admin UI surfaces it in result.json. 
Bunker Admin --- scripts/upgrade.sh | 149 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 133 insertions(+), 16 deletions(-) diff --git a/scripts/upgrade.sh b/scripts/upgrade.sh index c2a0ce93..0ec560f5 100755 --- a/scripts/upgrade.sh +++ b/scripts/upgrade.sh @@ -18,6 +18,14 @@ HEALTH_TIMEOUT=120 HEALTH_INTERVAL=5 MIN_DISK_MB=2048 +# Tracks which phase the upgrade is currently in, so on_failure can report +# "killed during Phase 4: Container Rebuild" instead of just an exit code. +CURRENT_PHASE_NAME="" +# Warnings accumulated during the run — surfaced in result.json. Global so +# Phase 7 probes (external reachability) can append without losing earlier +# entries set in the Phase 7 verification block. +UPGRADE_WARNINGS="[]" + # Source-built containers (always rebuilt) SOURCE_CONTAINERS="api admin media-api" # Conditionally rebuilt if Dockerfile changed @@ -93,8 +101,10 @@ PROGRESS_FILE="${UPGRADE_DIR}/progress.json" RESULT_FILE="${UPGRADE_DIR}/result.json" write_progress() { - [[ "$API_MODE" != "true" ]] && return local phase_num="$1" phase_name="$2" pct="$3" msg="$4" + # Track phase name for on_failure regardless of API_MODE — useful for logs too. + CURRENT_PHASE_NAME="$phase_name" + [[ "$API_MODE" != "true" ]] && return mkdir -p "$UPGRADE_DIR" cat > "$PROGRESS_FILE" </dev/null || true release_lock if [[ $exit_code -ne 0 ]] && [[ "$DRY_RUN" != "true" ]]; then - error "Upgrade failed at line ${BASH_LINENO[0]} (exit code $exit_code)" - write_result "false" "Upgrade failed at line ${BASH_LINENO[0]} (exit code ${exit_code})" + local phase_tag="${CURRENT_PHASE_NAME:-unknown phase}" + local fail_msg="Upgrade failed during ${phase_tag} at line ${BASH_LINENO[0]} (exit ${exit_code})" + error "$fail_msg" + # Always write the failure result — previously gated behind API_MODE, + # which meant SIGTERM during a watcher-triggered upgrade left stale + # success data in result.json. 
+ write_result_force "false" "$fail_msg" + # Clear progress so the admin UI doesn't show a phantom in-progress phase. + rm -f "$PROGRESS_FILE" 2>/dev/null || true + # Append to history so the failure is discoverable later. + archive_failure_to_history "$fail_msg" print_rollback_help info "Log file: $LOG_FILE" fi } +# Same as write_result but bypasses the API_MODE guard. Used by on_failure +# to ensure a failure record always lands, even in non-API-mode runs. +write_result_force() { + local success="$1" msg="$2" + local duration_secs=$((SECONDS - ${START_TIME:-SECONDS})) + mkdir -p "$UPGRADE_DIR" + cat > "$RESULT_FILE" </dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")", + "commitCount": ${COMMIT_COUNT:-0}, + "durationSeconds": ${duration_secs}, + "warnings": ${UPGRADE_WARNINGS:-[]}, + "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)" +} +REOF +} + +# Append a failure record to history.json (newest first, capped at 50 entries +# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts). 
+archive_failure_to_history() { + local msg="$1" + local hist="${UPGRADE_DIR}/history.json" + mkdir -p "$UPGRADE_DIR" + local entry + entry="$(cat </dev/null || echo "unknown")","commitCount":${COMMIT_COUNT:-0},"durationSeconds":$((SECONDS - ${START_TIME:-SECONDS})),"warnings":[],"completedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)"} +HEOF + )" + python3 - "$hist" "$entry" <<'PYEOF' 2>/dev/null || true +import json, sys, os +hist_path, entry_json = sys.argv[1], sys.argv[2] +try: + with open(hist_path) as f: + history = json.load(f) + if not isinstance(history, list): + history = [] +except Exception: + history = [] +history.insert(0, json.loads(entry_json)) +history = history[:50] +with open(hist_path, 'w') as f: + json.dump(history, f, indent=2) +PYEOF +} + # ============================================================================= # Parse Arguments # ============================================================================= @@ -356,6 +429,11 @@ if [[ "$DRY_RUN" == "true" ]]; then fi trap on_failure EXIT +# Explicit SIGTERM/SIGINT traps: bash does NOT run the EXIT trap when the +# shell is killed by an untrapped signal — hence the marcelle v2.9.2 → v2.9.3 +# SIGTERM-kill wrote no failure result. Worst case the handler fires twice, +# and write_result uses `>` so the second write is idempotent. +trap on_failure TERM INT acquire_lock load_env @@ -652,9 +730,13 @@ for a in json.load(sys.stdin).get('assets', []): # Save user paths save_user_paths - # Sync new files, preserving .env + # Sync new files, preserving .env. VERSION is staged to a pending + # location and only promoted after Phase 7 verification succeeds (Fix B), + # so interrupted upgrades don't leave a misleading "upgraded" marker. write_progress 3 "Code Update" 40 "Applying update..."
- rsync -a --exclude='.env' "$UPDATE_SRC/" "$PROJECT_DIR/" + rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/" + mkdir -p "$UPGRADE_DIR" + cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending" # Restore user paths restore_user_paths @@ -861,14 +943,17 @@ if [[ "$USE_REGISTRY" == "true" ]]; then write_progress 4 "Container Rebuild" 55 "Pulling images from registry..." # Pull core app containers: try SHA tag → :latest fallback → source build + # NOTE: stderr intentionally flows through so slow/broken pulls are visible + # in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3 + # systemd-killed upgrade with zero error trace. PULLED_TAG="" - if docker compose pull api admin media-api 2>/dev/null; then + if docker compose pull api admin media-api; then success "Core images pulled from registry (tag: ${REGISTRY_TAG})" PULLED_TAG="$REGISTRY_TAG" elif [[ "$REGISTRY_TAG" != "latest" ]]; then warn "Tag :${REGISTRY_TAG} not in registry — trying :latest" export IMAGE_TAG="latest" - if docker compose pull api admin media-api 2>/dev/null; then + if docker compose pull api admin media-api; then success "Core images pulled from registry (tag: latest)" PULLED_TAG="latest" # Retag :latest as :SHA so compose up uses consistent tags @@ -891,12 +976,12 @@ if [[ "$USE_REGISTRY" == "true" ]]; then # nginx: try SHA → :latest → rebuild if config changed NGINX_PULLED=false - if docker compose pull nginx 2>/dev/null; then + if docker compose pull nginx; then success "nginx pulled from registry (tag: ${IMAGE_TAG})" NGINX_PULLED=true elif [[ "$REGISTRY_TAG" != "latest" ]]; then export IMAGE_TAG="latest" - if docker compose pull nginx 2>/dev/null; then + if docker compose pull nginx; then docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true success "nginx pulled from registry (tag: latest)" NGINX_PULLED=true @@ -938,7 +1023,7 @@ fi # Optionally pull third-party images if [[ 
"$PULL_SERVICES" == "true" ]]; then info "Pulling latest third-party images..." - docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog 2>/dev/null || true + docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true success "Third-party images updated" # Record image digests for audit trail @@ -1217,10 +1302,48 @@ fi if [[ "$VERIFY_FAILED" == "true" ]]; then warn "Some health checks failed. Services may still be starting." info "Check logs: docker compose logs --tail 50" + UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]' else success "All health checks passed" fi +# --- External reachability probe (Fix C) --- +# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be +# caught by the localhost-only checks above. Warn (don't fail) because +# transient tunnel issues should not roll back a successful upgrade. +if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then + info "Probing external API at https://api.${DOMAIN}/api/health ..." + EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \ + "https://api.${DOMAIN}/api/health" 2>/dev/null || echo "000")" + if [[ "$EXT_CODE" == "200" ]]; then + success "External API reachable (HTTP 200)" + else + warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel" + UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS" +import json, sys +try: + w = json.loads(sys.argv[1]) if sys.argv[1] else [] +except Exception: + w = [] +if not isinstance(w, list): + w = [] +w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}") +print(json.dumps(w)) +PYEOF +)" + fi +fi + +# --- Atomic VERSION promotion (Fix B) --- +# The staged VERSION from Phase 3 lands only now, after full verification. 
+# On any prior failure, on_failure removes VERSION.pending and the live +# VERSION file remains at the pre-upgrade value — so upgrade-check.sh +# correctly reports "upgrade available" on the next check. +if [[ "$VERIFY_FAILED" != "true" ]] && [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then + mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION" + success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")" +fi + # ============================================================================= # Summary # ============================================================================= @@ -1232,12 +1355,6 @@ else FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")" fi -# Collect warnings for API mode result -UPGRADE_WARNINGS="[]" -if [[ "$VERIFY_FAILED" == "true" ]]; then - UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]' -fi - write_progress 7 "Verification" 100 "Upgrade complete!" write_result "true" "Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"