Upgrade failure visibility + atomic VERSION + external smoke test
Three fixes to harden the admin-UI upgrade path, all in scripts/upgrade.sh. Root-caused from yesterday's v2.9.2 → v2.9.3 upgrade on marcelle, which was killed by systemd mid-Phase-4 and left the system in a misleading half-upgraded state (VERSION bumped, containers still pre-upgrade, result.json stale from 24h prior).
- Fix A (failure visibility): stop silencing stderr on the five docker compose pull sites so timeouts / auth failures / network errors flow into upgrade-watcher.log. Add explicit SIGTERM/SIGINT traps alongside the existing EXIT trap. Track CURRENT_PHASE_NAME globally so the failure message reports "during Phase 4: Container Rebuild" rather than just an exit code. Introduce write_result_force (bypasses the API_MODE guard) + archive_failure_to_history so a killed upgrade always leaves a truthful result.json + history.json entry, and clear progress.json so the admin UI stops showing a phantom in-progress phase.
- Fix B (atomic VERSION): the Phase 3 rsync now --excludes VERSION and stashes the new one at data/upgrade/VERSION.pending. Phase 7 promotes it to VERSION only after VERIFY_FAILED stays false. on_failure deletes the pending file. upgrade-check.sh needs no changes — its head -1 VERSION read sees the actual state instead of a mid-upgrade promise.
- Fix C (external smoke): after the Phase 7 localhost checks, curl https://api.${DOMAIN}/api/health with --max-time 10 and warn (not fail) on non-200. Catches Pangolin resource misassignments that the localhost-only checks miss. Appends to UPGRADE_WARNINGS so the admin UI surfaces it in result.json.
Bunker Admin
This commit is contained in:
parent
12708e5824
commit
47704667b1
@ -18,6 +18,14 @@ HEALTH_TIMEOUT=120
|
||||
HEALTH_INTERVAL=5
|
||||
MIN_DISK_MB=2048
|
||||
|
||||
# Tracks which phase the upgrade is currently in, so on_failure can report
|
||||
# "killed during Phase 4: Container Rebuild" instead of just an exit code.
|
||||
CURRENT_PHASE_NAME=""
|
||||
# Warnings accumulated during the run — surfaced in result.json. Global so
|
||||
# Phase 7 probes (external reachability) can append without losing earlier
|
||||
# entries set in the Phase 7 verification block.
|
||||
UPGRADE_WARNINGS="[]"
|
||||
|
||||
# Source-built containers (always rebuilt)
|
||||
SOURCE_CONTAINERS="api admin media-api"
|
||||
# Conditionally rebuilt if Dockerfile changed
|
||||
@ -93,8 +101,10 @@ PROGRESS_FILE="${UPGRADE_DIR}/progress.json"
|
||||
RESULT_FILE="${UPGRADE_DIR}/result.json"
|
||||
|
||||
write_progress() {
|
||||
[[ "$API_MODE" != "true" ]] && return
|
||||
local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
|
||||
# Track phase name for on_failure regardless of API_MODE — useful for logs too.
|
||||
CURRENT_PHASE_NAME="$phase_name"
|
||||
[[ "$API_MODE" != "true" ]] && return
|
||||
mkdir -p "$UPGRADE_DIR"
|
||||
cat > "$PROGRESS_FILE" <<PEOF
|
||||
{
|
||||
@ -266,21 +276,84 @@ print_rollback_help() {
|
||||
}
|
||||
|
||||
# --- Failure trap ---
# Cleanup + failure reporting handler, installed via `trap`.
#
# Always: removes the user-path save directory (if any), discards the staged
# VERSION.pending (Fix B) and releases the upgrade lock.
# On a non-zero exit outside dry-run mode it additionally writes a failure
# result, clears progress so the admin UI stops showing a frozen phase, and
# archives the failure to history so it is retrievable later.
#
# Globals read:    USER_SAVE_DIR, UPGRADE_DIR, DRY_RUN, CURRENT_PHASE_NAME,
#                  PROGRESS_FILE, LOG_FILE
# Globals written: ON_FAILURE_REPORTED (report-once guard)
on_failure() {
  # Must be the very first statement: any other command would overwrite $?.
  local exit_code=$?
  local fail_line="${BASH_LINENO[0]:-unknown}"

  # Clean up user path save directory if it exists
  if [[ -n "${USER_SAVE_DIR:-}" ]] && [[ -d "${USER_SAVE_DIR:-}" ]]; then
    rm -rf "$USER_SAVE_DIR"
  fi

  # Discard staged VERSION — the bump must only happen after full success.
  rm -f "${UPGRADE_DIR}/VERSION.pending" 2>/dev/null || true

  release_lock

  # Nothing to report on a clean exit or during a dry run.
  if [[ $exit_code -eq 0 ]] || [[ "$DRY_RUN" == "true" ]]; then
    return 0
  fi

  # The handler can be entered more than once for a single failure (signal
  # trap followed by the EXIT trap). The history append is not idempotent, so
  # report each failure exactly once.
  if [[ "${ON_FAILURE_REPORTED:-false}" == "true" ]]; then
    return 0
  fi
  ON_FAILURE_REPORTED=true

  local phase_tag="${CURRENT_PHASE_NAME:-unknown phase}"
  local fail_msg="Upgrade failed during ${phase_tag} at line ${fail_line} (exit ${exit_code})"
  error "$fail_msg"

  # Always write the failure result — previously gated behind API_MODE,
  # which meant SIGTERM during a watcher-triggered upgrade left stale
  # success data in result.json.
  write_result_force "false" "$fail_msg"

  # Clear progress so the admin UI doesn't show a phantom in-progress phase.
  rm -f "$PROGRESS_FILE" 2>/dev/null || true

  # Append to history so the failure is discoverable later.
  archive_failure_to_history "$fail_msg"

  print_rollback_help
  info "Log file: $LOG_FILE"
}
|
||||
|
||||
# Same as write_result but bypasses the API_MODE guard. Used by on_failure
# to ensure a failure record always lands, even in non-API-mode runs.
#
# Arguments:
#   $1 - JSON boolean literal for the "success" field ("true" / "false")
#   $2 - human-readable message (escaped here before being embedded in JSON)
# Globals read: UPGRADE_DIR, RESULT_FILE, PROJECT_DIR, PRE_UPGRADE_SHORT,
#               COMMIT_COUNT, UPGRADE_WARNINGS, START_TIME
# Outputs: replaces $RESULT_FILE with a fresh JSON document.
write_result_force() {
  local success="$1" msg="$2"
  # START_TIME may be unset if the failure happens very early; fall back to
  # SECONDS so the duration is reported as 0 instead of a bogus value.
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))
  local escaped_msg tmp_file

  # JSON string escaping. Backslashes must be handled first so the escapes
  # added for quotes and control characters are not escaped a second time.
  escaped_msg=${msg//\\/\\\\}
  escaped_msg=${escaped_msg//\"/\\\"}
  escaped_msg=${escaped_msg//$'\n'/\\n}
  escaped_msg=${escaped_msg//$'\r'/\\r}
  escaped_msg=${escaped_msg//$'\t'/\\t}

  mkdir -p "$UPGRADE_DIR"

  # Write to a sibling temp file and rename it into place so a concurrent
  # reader never sees a half-written result.json.
  tmp_file="${RESULT_FILE}.tmp.$$"
  if cat > "$tmp_file" <<REOF
{
  "success": ${success},
  "message": "${escaped_msg}",
  "previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
  "newCommit": "$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
  "commitCount": ${COMMIT_COUNT:-0},
  "durationSeconds": ${duration_secs},
  "warnings": ${UPGRADE_WARNINGS:-[]},
  "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
REOF
  then
    mv -f "$tmp_file" "$RESULT_FILE"
  else
    rm -f "$tmp_file"
    return 1
  fi
}
|
||||
|
||||
# Append a failure record to history.json (newest first, capped at 50 entries
# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts).
#
# Best-effort: a problem while archiving is reported on stderr but never
# changes the caller's exit path.
#
# Arguments:
#   $1 - failure message (passed to Python as-is; JSON escaping is done there)
# Globals read: UPGRADE_DIR, PROJECT_DIR, PRE_UPGRADE_SHORT, COMMIT_COUNT,
#               START_TIME
archive_failure_to_history() {
  local msg="$1"
  local hist="${UPGRADE_DIR}/history.json"
  local new_commit completed_at
  local duration_secs=$((SECONDS - ${START_TIME:-SECONDS}))

  mkdir -p "$UPGRADE_DIR"
  new_commit="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # The entry is assembled inside Python from plain argv strings, so quotes,
  # backslashes and newlines in the message can no longer produce invalid
  # JSON (which previously made the append fail silently).
  python3 - "$hist" "$msg" "${PRE_UPGRADE_SHORT:-unknown}" "$new_commit" \
    "${COMMIT_COUNT:-0}" "$duration_secs" "$completed_at" <<'PYEOF' ||
import json, os, sys

hist_path, msg, prev_commit, new_commit, commit_count, duration, completed_at = sys.argv[1:8]


def as_int(value):
    try:
        return int(value)
    except ValueError:
        return 0


# A missing or unreadable history file starts a fresh list.
try:
    with open(hist_path) as f:
        history = json.load(f)
    if not isinstance(history, list):
        history = []
except Exception:
    history = []

history.insert(0, {
    "success": False,
    "message": msg,
    "previousCommit": prev_commit,
    "newCommit": new_commit,
    "commitCount": as_int(commit_count),
    "durationSeconds": as_int(duration),
    "warnings": [],
    "completedAt": completed_at,
})
history = history[:50]

# Write to a temp file and rename, so an interrupted write cannot truncate
# the existing history.
tmp_path = hist_path + ".tmp"
with open(tmp_path, "w") as f:
    json.dump(history, f, indent=2)
os.replace(tmp_path, hist_path)
PYEOF
    printf 'WARNING: could not archive upgrade failure to %s\n' "$hist" >&2
}
|
||||
|
||||
# =============================================================================
|
||||
# Parse Arguments
|
||||
# =============================================================================
|
||||
@ -356,6 +429,11 @@ if [[ "$DRY_RUN" == "true" ]]; then
|
||||
fi
|
||||
|
||||
trap on_failure EXIT
# SIGTERM/SIGINT handling (the marcelle v2.9.2 → v2.9.3 SIGTERM-kill left no
# failure result behind). A trapped signal does not terminate bash by itself:
# once the handler returns, the script resumes with its next command. Running
# on_failure directly as the signal handler would therefore clean up (release
# the lock, drop VERSION.pending) and then let the upgrade keep running, and
# because $? inside a signal handler is the status of whichever command ran
# last, the failure record could be skipped entirely.
# Convert the signals into a real exit with the conventional 128+N status
# instead; the EXIT trap above then runs on_failure exactly once with a
# non-zero exit code.
trap 'exit 143' TERM
trap 'exit 130' INT
|
||||
acquire_lock
|
||||
load_env
|
||||
|
||||
@ -652,9 +730,13 @@ for a in json.load(sys.stdin).get('assets', []):
|
||||
# Save user paths
|
||||
save_user_paths
|
||||
|
||||
# Sync new files, preserving .env
|
||||
# Sync new files, preserving .env. VERSION is staged to a pending
|
||||
# location and only promoted after Phase 7 verification succeeds (Fix B),
|
||||
# so interrupted upgrades don't leave a misleading "upgraded" marker.
|
||||
write_progress 3 "Code Update" 40 "Applying update..."
|
||||
rsync -a --exclude='.env' "$UPDATE_SRC/" "$PROJECT_DIR/"
|
||||
rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/"
|
||||
mkdir -p "$UPGRADE_DIR"
|
||||
cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending"
|
||||
|
||||
# Restore user paths
|
||||
restore_user_paths
|
||||
@ -861,14 +943,17 @@ if [[ "$USE_REGISTRY" == "true" ]]; then
|
||||
write_progress 4 "Container Rebuild" 55 "Pulling images from registry..."
|
||||
|
||||
# Pull core app containers: try SHA tag → :latest fallback → source build
|
||||
# NOTE: stderr intentionally flows through so slow/broken pulls are visible
|
||||
# in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3
|
||||
# systemd-killed upgrade with zero error trace.
|
||||
PULLED_TAG=""
|
||||
if docker compose pull api admin media-api 2>/dev/null; then
|
||||
if docker compose pull api admin media-api; then
|
||||
success "Core images pulled from registry (tag: ${REGISTRY_TAG})"
|
||||
PULLED_TAG="$REGISTRY_TAG"
|
||||
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
|
||||
warn "Tag :${REGISTRY_TAG} not in registry — trying :latest"
|
||||
export IMAGE_TAG="latest"
|
||||
if docker compose pull api admin media-api 2>/dev/null; then
|
||||
if docker compose pull api admin media-api; then
|
||||
success "Core images pulled from registry (tag: latest)"
|
||||
PULLED_TAG="latest"
|
||||
# Retag :latest as :SHA so compose up uses consistent tags
|
||||
@ -891,12 +976,12 @@ if [[ "$USE_REGISTRY" == "true" ]]; then
|
||||
|
||||
# nginx: try SHA → :latest → rebuild if config changed
|
||||
NGINX_PULLED=false
|
||||
if docker compose pull nginx 2>/dev/null; then
|
||||
if docker compose pull nginx; then
|
||||
success "nginx pulled from registry (tag: ${IMAGE_TAG})"
|
||||
NGINX_PULLED=true
|
||||
elif [[ "$REGISTRY_TAG" != "latest" ]]; then
|
||||
export IMAGE_TAG="latest"
|
||||
if docker compose pull nginx 2>/dev/null; then
|
||||
if docker compose pull nginx; then
|
||||
docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true
|
||||
success "nginx pulled from registry (tag: latest)"
|
||||
NGINX_PULLED=true
|
||||
@ -938,7 +1023,7 @@ fi
|
||||
# Optionally pull third-party images
|
||||
if [[ "$PULL_SERVICES" == "true" ]]; then
|
||||
info "Pulling latest third-party images..."
|
||||
docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog 2>/dev/null || true
|
||||
docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true
|
||||
success "Third-party images updated"
|
||||
|
||||
# Record image digests for audit trail
|
||||
@ -1217,10 +1302,48 @@ fi
|
||||
if [[ "$VERIFY_FAILED" == "true" ]]; then
|
||||
warn "Some health checks failed. Services may still be starting."
|
||||
info "Check logs: docker compose logs --tail 50"
|
||||
UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
|
||||
else
|
||||
success "All health checks passed"
|
||||
fi
|
||||
|
||||
# --- External reachability probe (Fix C) ---
|
||||
# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be
|
||||
# caught by the localhost-only checks above. Warn (don't fail) because
|
||||
# transient tunnel issues should not roll back a successful upgrade.
|
||||
# Probe the public API endpoint once; any non-200 outcome is recorded as a
# warning in UPGRADE_WARNINGS and never fails the upgrade.
if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then
  info "Probing external API at https://api.${DOMAIN}/api/health ..."
  # curl still prints the -w format when the transfer fails (http_code is
  # then 000), so a `|| echo "000"` fallback would yield "000000". Ignore the
  # exit status and only default the value if curl printed nothing at all.
  EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
    "https://api.${DOMAIN}/api/health" 2>/dev/null)" || true
  EXT_CODE="${EXT_CODE:-000}"
  if [[ "$EXT_CODE" == "200" ]]; then
    success "External API reachable (HTTP 200)"
  else
    warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel"
    # Append to the existing JSON array; if python3 is unavailable or fails,
    # keep the warnings collected so far unchanged.
    UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS"
import json, sys
try:
    w = json.loads(sys.argv[1]) if sys.argv[1] else []
except Exception:
    w = []
if not isinstance(w, list):
    w = []
w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}")
print(json.dumps(w))
PYEOF
)"
  fi
fi
|
||||
|
||||
# --- Atomic VERSION promotion (Fix B) ---
|
||||
# The staged VERSION from Phase 3 lands only now, after full verification.
|
||||
# On any prior failure, on_failure removes VERSION.pending and the live
|
||||
# VERSION file remains at the pre-upgrade value — so upgrade-check.sh
|
||||
# correctly reports "upgrade available" on the next check.
|
||||
if [[ "$VERIFY_FAILED" != "true" ]] && [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then
|
||||
mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION"
|
||||
success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")"
|
||||
fi
|
||||
|
||||
# =============================================================================
|
||||
# Summary
|
||||
# =============================================================================
|
||||
@ -1232,12 +1355,6 @@ else
|
||||
FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
|
||||
fi
|
||||
|
||||
# Collect warnings for API mode result
|
||||
UPGRADE_WARNINGS="[]"
|
||||
if [[ "$VERIFY_FAILED" == "true" ]]; then
|
||||
UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]'
|
||||
fi
|
||||
|
||||
write_progress 7 "Verification" 100 "Upgrade complete!"
|
||||
write_result "true" "Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user