Upgrade failure visibility + atomic VERSION + external smoke test

Three fixes to harden the admin-UI upgrade path, all in scripts/upgrade.sh.
Motivated by yesterday's v2.9.2 → v2.9.3 upgrade on marcelle, which systemd
killed mid-Phase-4, leaving the system in a misleading half-upgraded state
(VERSION bumped, containers still pre-upgrade, result.json stale from 24h prior).

- Fix A (failure visibility): stop silencing stderr on the five docker
  compose pull sites so timeouts / auth failures / network errors flow
  into upgrade-watcher.log. Add explicit SIGTERM/SIGINT traps alongside
  the existing EXIT trap. Track CURRENT_PHASE_NAME globally so the
  failure message reports "during Phase 4: Container Rebuild" rather
  than just an exit code. Introduce write_result_force (bypasses
  API_MODE guard) + archive_failure_to_history so a killed upgrade
  always leaves a truthful result.json + history.json entry, and the
  progress.json is cleared so the admin UI stops showing a phantom
  in-progress phase.

- Fix B (atomic VERSION): Phase 3 rsync now --excludes VERSION and
  stashes the new one at data/upgrade/VERSION.pending. Phase 7 promotes
  it to VERSION only after VERIFY_FAILED stays false. on_failure deletes
  the pending file. upgrade-check.sh needs no changes — its head -1
  VERSION read sees actual state instead of a mid-upgrade promise.

- Fix C (external smoke): after Phase 7 localhost checks, curl
  https://api.${DOMAIN}/api/health with --max-time 10 and warn (not
  fail) on non-200. Catches Pangolin resource misassignments that the
  localhost-only checks miss. Appends to UPGRADE_WARNINGS so the admin
  UI surfaces it in result.json.

Bunker Admin
This commit is contained in:
bunker-admin 2026-04-15 16:13:04 -06:00
parent 12708e5824
commit 47704667b1

View File

@ -18,6 +18,14 @@ HEALTH_TIMEOUT=120
HEALTH_INTERVAL=5
MIN_DISK_MB=2048
# Name of the phase currently executing. write_progress keeps it up to date
# and on_failure reads it, so a failure can be reported as
# "during Phase 4: Container Rebuild" rather than as a bare exit code.
CURRENT_PHASE_NAME=''
# Warnings collected over the run, held as JSON array text and emitted into
# result.json. Kept global so the Phase 7 external reachability probe can
# append to whatever the Phase 7 verification block already recorded.
UPGRADE_WARNINGS='[]'
# Source-built containers (always rebuilt)
SOURCE_CONTAINERS="api admin media-api"
# Conditionally rebuilt if Dockerfile changed
@ -93,8 +101,10 @@ PROGRESS_FILE="${UPGRADE_DIR}/progress.json"
RESULT_FILE="${UPGRADE_DIR}/result.json"
write_progress() {
[[ "$API_MODE" != "true" ]] && return
local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
# Track phase name for on_failure regardless of API_MODE — useful for logs too.
CURRENT_PHASE_NAME="$phase_name"
[[ "$API_MODE" != "true" ]] && return
mkdir -p "$UPGRADE_DIR"
cat > "$PROGRESS_FILE" <<PEOF
{
@ -266,21 +276,84 @@ print_rollback_help() {
}
# --- Failure trap ---
# Registered as the EXIT trap (and for SIGTERM/SIGINT), so the cleanup part
# runs on every exit; the reporting part runs only when the captured status
# is non-zero and this is not a dry run. Reporting writes a truthful
# failure result, clears progress so the admin UI stops showing a frozen
# phase, and archives to history so the failure is retrievable. The staged
# VERSION (Fix B) is discarded in the cleanup part.
#
# Globals read: USER_SAVE_DIR, UPGRADE_DIR, DRY_RUN, CURRENT_PHASE_NAME,
#               PROGRESS_FILE, LOG_FILE
# NOTE(review): the reporting branch only runs when $? is non-zero at trap
# entry; verify that every signal path into this handler arrives with a
# non-zero status.
on_failure() {
# Must be the first statement: any later command would overwrite $?.
local exit_code=$?
# Clean up user path save directory if it exists
if [[ -n "${USER_SAVE_DIR:-}" ]] && [[ -d "${USER_SAVE_DIR:-}" ]]; then
rm -rf "$USER_SAVE_DIR"
fi
# Discard staged VERSION — the bump must only happen after full success.
# This runs regardless of exit_code, i.e. also on a clean exit.
rm -f "${UPGRADE_DIR}/VERSION.pending" 2>/dev/null || true
# NOTE(review): the lock is released before result.json / history.json are
# written below — confirm that ordering is intended.
release_lock
if [[ $exit_code -ne 0 ]] && [[ "$DRY_RUN" != "true" ]]; then
# NOTE(review): the next two lines repeat the reporting done further down
# using the older message format and write_result (described below as gated
# behind API_MODE). They look like the pre-change lines of this diff —
# confirm they are absent from the final file.
error "Upgrade failed at line ${BASH_LINENO[0]} (exit code $exit_code)"
write_result "false" "Upgrade failed at line ${BASH_LINENO[0]} (exit code ${exit_code})"
# Fall back to a placeholder when no phase has been recorded yet.
local phase_tag="${CURRENT_PHASE_NAME:-unknown phase}"
local fail_msg="Upgrade failed during ${phase_tag} at line ${BASH_LINENO[0]} (exit ${exit_code})"
error "$fail_msg"
# Always write the failure result — previously gated behind API_MODE,
# which meant SIGTERM during a watcher-triggered upgrade left stale
# success data in result.json.
write_result_force "false" "$fail_msg"
# Clear progress so the admin UI doesn't show a phantom in-progress phase.
rm -f "$PROGRESS_FILE" 2>/dev/null || true
# Append to history so the failure is discoverable later.
archive_failure_to_history "$fail_msg"
print_rollback_help
info "Log file: $LOG_FILE"
fi
}
# Same as write_result but bypasses the API_MODE guard. Used by on_failure
# to ensure a failure record always lands, even in non-API-mode runs.
#
# Globals read: UPGRADE_DIR, RESULT_FILE, PROJECT_DIR, PRE_UPGRADE_SHORT,
#               COMMIT_COUNT, START_TIME, UPGRADE_WARNINGS
# Arguments:    $1 - JSON boolean literal ("true" or "false")
#               $2 - human-readable result message
# Outputs:      replaces $RESULT_FILE with a freshly written JSON document
write_result_force() {
  local success="$1" msg="$2"
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))

  # Escape the message for a JSON string. Backslash must go first so the
  # escapes added afterwards are not themselves doubled. Escaping only the
  # double quote (as before) produced invalid JSON for messages containing
  # backslashes or line breaks.
  local esc_msg="$msg"
  esc_msg=${esc_msg//\\/\\\\}
  esc_msg=${esc_msg//\"/\\\"}
  esc_msg=${esc_msg//$'\n'/\\n}
  esc_msg=${esc_msg//$'\r'/\\r}
  esc_msg=${esc_msg//$'\t'/\\t}

  # commitCount is emitted as a bare JSON number; anything non-numeric
  # would corrupt the document, so fall back to 0.
  local commit_count="${COMMIT_COUNT:-0}"
  [[ "$commit_count" =~ ^[0-9]+$ ]] || commit_count=0

  local new_commit
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")"

  mkdir -p "$UPGRADE_DIR"

  # Write to a sibling temp file and rename into place so a concurrent
  # reader never observes a half-written result.json.
  local tmp_file="${RESULT_FILE}.tmp.$$"
  cat > "$tmp_file" <<REOF
{
  "success": ${success},
  "message": "${esc_msg}",
  "previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
  "newCommit": "${new_commit}",
  "commitCount": ${commit_count},
  "durationSeconds": ${duration_secs},
  "warnings": ${UPGRADE_WARNINGS:-[]},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  mv -f "$tmp_file" "$RESULT_FILE"
}
# Append a failure record to history.json (newest first, capped at 50 entries
# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts).
#
# Best effort: if python3 is missing or fails, history is left untouched and
# the caller continues.
#
# Globals read: UPGRADE_DIR, PROJECT_DIR, PRE_UPGRADE_SHORT, COMMIT_COUNT,
#               START_TIME, UPGRADE_WARNINGS
# Arguments:    $1 - failure message to record
archive_failure_to_history() {
  local msg="$1"
  local hist="${UPGRADE_DIR}/history.json"
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))
  local new_commit completed_at
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  mkdir -p "$UPGRADE_DIR"

  # Every field is handed to Python as a plain argument and serialised there.
  # Building the JSON text in the shell broke on messages containing a
  # backslash or newline, and the resulting parse error silently dropped the
  # history entry.
  python3 - "$hist" "$msg" "${PRE_UPGRADE_SHORT:-unknown}" "$new_commit" \
    "${COMMIT_COUNT:-0}" "$duration_secs" "${UPGRADE_WARNINGS:-[]}" \
    "$completed_at" <<'PYEOF' 2>/dev/null || true
import json
import os
import sys

(hist_path, message, previous, new, commit_count,
 duration, warnings_json, completed_at) = sys.argv[1:9]


def as_int(text):
    """Return text as an int, or 0 when it is not a number."""
    try:
        return int(text)
    except ValueError:
        return 0


# Carry over the warnings collected so far, matching what result.json gets.
try:
    warnings = json.loads(warnings_json)
    if not isinstance(warnings, list):
        warnings = []
except Exception:
    warnings = []

# A missing, unreadable or malformed history file starts a fresh list.
try:
    with open(hist_path) as f:
        history = json.load(f)
    if not isinstance(history, list):
        history = []
except Exception:
    history = []

history.insert(0, {
    "success": False,
    "message": message,
    "previousCommit": previous,
    "newCommit": new,
    "commitCount": as_int(commit_count),
    "durationSeconds": as_int(duration),
    "warnings": warnings,
    "completedAt": completed_at,
})
history = history[:50]

# Write beside the target and rename, so an interrupted write cannot leave
# a truncated history.json behind.
tmp_path = hist_path + ".tmp"
with open(tmp_path, "w") as f:
    json.dump(history, f, indent=2)
os.replace(tmp_path, hist_path)
PYEOF
}
# =============================================================================
# Parse Arguments
# =============================================================================
@ -356,6 +429,11 @@ if [[ "$DRY_RUN" == "true" ]]; then
fi
# on_failure runs from the EXIT trap only, so there is a single reporting path.
trap on_failure EXIT
# A trapped signal does not terminate bash on its own: once the handler
# returns, the script simply carries on. Registering on_failure directly for
# TERM/INT therefore let the upgrade continue after its cleanup had already
# run, and the handler could see $? == 0 and skip the failure report.
# Converting each signal into an exit with the conventional 128+N status
# makes the EXIT trap fire exactly once with a non-zero exit code.
trap 'exit 143' TERM
trap 'exit 130' INT
acquire_lock
load_env
@ -652,9 +730,13 @@ for a in json.load(sys.stdin).get('assets', []):
# Save user paths
save_user_paths
# Sync new files, preserving .env
# Sync new files, preserving .env. VERSION is staged to a pending
# location and only promoted after Phase 7 verification succeeds (Fix B),
# so interrupted upgrades don't leave a misleading "upgraded" marker.
write_progress 3 "Code Update" 40 "Applying update..."
rsync -a --exclude='.env' "$UPDATE_SRC/" "$PROJECT_DIR/"
rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/"
mkdir -p "$UPGRADE_DIR"
cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending"
# Restore user paths
restore_user_paths
@ -861,14 +943,17 @@ if [[ "$USE_REGISTRY" == "true" ]]; then
write_progress 4 "Container Rebuild" 55 "Pulling images from registry..."
# Pull core app containers: try SHA tag → :latest fallback → source build
# NOTE: stderr intentionally flows through so slow/broken pulls are visible
# in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3
# systemd-killed upgrade with zero error trace.
PULLED_TAG=""
if docker compose pull api admin media-api 2>/dev/null; then
if docker compose pull api admin media-api; then
success "Core images pulled from registry (tag: ${REGISTRY_TAG})"
PULLED_TAG="$REGISTRY_TAG"
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
warn "Tag :${REGISTRY_TAG} not in registry — trying :latest"
export IMAGE_TAG="latest"
if docker compose pull api admin media-api 2>/dev/null; then
if docker compose pull api admin media-api; then
success "Core images pulled from registry (tag: latest)"
PULLED_TAG="latest"
# Retag :latest as :SHA so compose up uses consistent tags
@ -891,12 +976,12 @@ if [[ "$USE_REGISTRY" == "true" ]]; then
# nginx: try SHA → :latest → rebuild if config changed
NGINX_PULLED=false
if docker compose pull nginx 2>/dev/null; then
if docker compose pull nginx; then
success "nginx pulled from registry (tag: ${IMAGE_TAG})"
NGINX_PULLED=true
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
export IMAGE_TAG="latest"
if docker compose pull nginx 2>/dev/null; then
if docker compose pull nginx; then
docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true
success "nginx pulled from registry (tag: latest)"
NGINX_PULLED=true
@ -938,7 +1023,7 @@ fi
# Optionally pull third-party images
if [[ "$PULL_SERVICES" == "true" ]]; then
info "Pulling latest third-party images..."
docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog 2>/dev/null || true
docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true
success "Third-party images updated"
# Record image digests for audit trail
@ -1217,10 +1302,48 @@ fi
# Report the outcome of the health checks. A failed verification is recorded
# as a warning in UPGRADE_WARNINGS rather than aborting the run.
if [[ "$VERIFY_FAILED" != "true" ]]; then
  success "All health checks passed"
else
  warn "Some health checks failed. Services may still be starting."
  info "Check logs: docker compose logs --tail 50"
  UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
fi
# --- External reachability probe (Fix C) ---
# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be
# caught by the localhost-only checks above. Warn (don't fail) because
# transient tunnel issues should not roll back a successful upgrade.
# Skipped when DOMAIN is empty or curl is not installed.
if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then
  info "Probing external API at https://api.${DOMAIN}/api/health ..."
  # curl still prints the -w value ("000") when the transfer fails, so a
  # trailing `|| echo "000"` inside the substitution yielded "000000".
  # Ignore curl's exit status and normalise the captured text instead.
  EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
    "https://api.${DOMAIN}/api/health" 2>/dev/null)" || true
  [[ "$EXT_CODE" =~ ^[0-9]{3}$ ]] || EXT_CODE="000"
  if [[ "$EXT_CODE" == "200" ]]; then
    success "External API reachable (HTTP 200)"
  else
    warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel"
    # Append to the existing JSON array; if python3 is unavailable or fails,
    # keep the warnings collected so far unchanged.
    UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS"
import json, sys
try:
    w = json.loads(sys.argv[1]) if sys.argv[1] else []
except Exception:
    w = []
if not isinstance(w, list):
    w = []
w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}")
print(json.dumps(w))
PYEOF
)"
  fi
fi
# --- Atomic VERSION promotion (Fix B) ---
# Phase 3 staged the new VERSION as VERSION.pending; it is moved over the live
# VERSION file only here, and only when verification did not fail. If the run
# failed earlier, on_failure has already removed VERSION.pending, so the live
# VERSION still holds the pre-upgrade value and upgrade-check.sh keeps
# reporting "upgrade available" on its next check.
if [[ "$VERIFY_FAILED" != "true" && -f "$UPGRADE_DIR/VERSION.pending" ]]; then
  mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION"
  success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")"
fi
# =============================================================================
# Summary
# =============================================================================
@ -1232,12 +1355,6 @@ else
FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
fi
# Collect warnings for API mode result
UPGRADE_WARNINGS="[]"
if [[ "$VERIFY_FAILED" == "true" ]]; then
UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
fi
write_progress 7 "Verification" 100 "Upgrade complete!"
write_result "true" "Upgraded ${PRE_UPGRADE_SHORT}${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"