From 47704667b14aa99e68fd4ef95ebdb7ed112594c3 Mon Sep 17 00:00:00 2001 From: bunker-admin Date: Wed, 15 Apr 2026 16:13:04 -0600 Subject: [PATCH] Upgrade failure visibility + atomic VERSION + external smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes to harden the admin-UI upgrade path, all in scripts/upgrade.sh. Root-caused by yesterday's v2.9.2 → v2.9.3 on marcelle which was killed by systemd mid-Phase-4 and left the system in a misleading half-upgraded state (VERSION bumped, container pre-upgrade, result.json stale from 24h prior). - Fix A (failure visibility): stop silencing stderr on the five docker compose pull sites so timeouts / auth failures / network errors flow into upgrade-watcher.log. Add explicit SIGTERM/SIGINT traps alongside the existing EXIT trap. Track CURRENT_PHASE_NAME globally so the failure message reports "during Phase 4: Container Rebuild" rather than just an exit code. Introduce write_result_force (bypasses API_MODE guard) + archive_failure_to_history so a killed upgrade always leaves a truthful result.json + history.json entry, and the progress.json is cleared so the admin UI stops showing a phantom in-progress phase. - Fix B (atomic VERSION): Phase 3 rsync now --excludes VERSION and stashes the new one at data/upgrade/VERSION.pending. Phase 7 promotes it to VERSION only after VERIFY_FAILED stays false. on_failure deletes the pending file. upgrade-check.sh needs no changes — its head -1 VERSION read sees actual state instead of a mid-upgrade promise. - Fix C (external smoke): after Phase 7 localhost checks, curl https://api.${DOMAIN}/api/health with --max-time 10 and warn (not fail) on non-200. Catches Pangolin resource misassignments that the localhost-only checks miss. Appends to UPGRADE_WARNINGS so the admin UI surfaces it in result.json. 
Bunker Admin --- scripts/upgrade.sh | 149 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 133 insertions(+), 16 deletions(-) diff --git a/scripts/upgrade.sh b/scripts/upgrade.sh index c2a0ce93..0ec560f5 100755 --- a/scripts/upgrade.sh +++ b/scripts/upgrade.sh @@ -18,6 +18,14 @@ HEALTH_TIMEOUT=120 HEALTH_INTERVAL=5 MIN_DISK_MB=2048 +# Tracks which phase the upgrade is currently in, so on_failure can report +# "killed during Phase 4: Container Rebuild" instead of just an exit code. +CURRENT_PHASE_NAME="" +# Warnings accumulated during the run — surfaced in result.json. Global so +# Phase 7 probes (external reachability) can append without losing earlier +# entries set in the Phase 7 verification block. +UPGRADE_WARNINGS="[]" + # Source-built containers (always rebuilt) SOURCE_CONTAINERS="api admin media-api" # Conditionally rebuilt if Dockerfile changed @@ -93,8 +101,10 @@ PROGRESS_FILE="${UPGRADE_DIR}/progress.json" RESULT_FILE="${UPGRADE_DIR}/result.json" write_progress() { - [[ "$API_MODE" != "true" ]] && return local phase_num="$1" phase_name="$2" pct="$3" msg="$4" + # Track phase name for on_failure regardless of API_MODE — useful for logs too. + CURRENT_PHASE_NAME="$phase_name" + [[ "$API_MODE" != "true" ]] && return mkdir -p "$UPGRADE_DIR" cat > "$PROGRESS_FILE" </dev/null || true release_lock if [[ $exit_code -ne 0 ]] && [[ "$DRY_RUN" != "true" ]]; then - error "Upgrade failed at line ${BASH_LINENO[0]} (exit code $exit_code)" - write_result "false" "Upgrade failed at line ${BASH_LINENO[0]} (exit code ${exit_code})" + local phase_tag="${CURRENT_PHASE_NAME:-unknown phase}" + local fail_msg="Upgrade failed during ${phase_tag} at line ${BASH_LINENO[0]} (exit ${exit_code})" + error "$fail_msg" + # Always write the failure result — previously gated behind API_MODE, + # which meant SIGTERM during a watcher-triggered upgrade left stale + # success data in result.json. 
+ write_result_force "false" "$fail_msg" + # Clear progress so the admin UI doesn't show a phantom in-progress phase. + rm -f "$PROGRESS_FILE" 2>/dev/null || true + # Append to history so the failure is discoverable later. + archive_failure_to_history "$fail_msg" print_rollback_help info "Log file: $LOG_FILE" fi } +# Same as write_result but bypasses the API_MODE guard. Used by on_failure +# to ensure a failure record always lands, even in non-API-mode runs. +write_result_force() { + local success="$1" msg="$2" + local duration_secs=$((SECONDS - ${START_TIME:-SECONDS})) + mkdir -p "$UPGRADE_DIR" + cat > "$RESULT_FILE" </dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")", + "commitCount": ${COMMIT_COUNT:-0}, + "durationSeconds": ${duration_secs}, + "warnings": ${UPGRADE_WARNINGS:-[]}, + "completedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)" +} +REOF +} + +# Append a failure record to history.json (newest first, capped at 50 entries +# to match MAX_HISTORY_ENTRIES in api/src/modules/upgrade/upgrade.service.ts). 
+archive_failure_to_history() { + local msg="$1" + local hist="${UPGRADE_DIR}/history.json" + mkdir -p "$UPGRADE_DIR" + local entry + entry="$(cat </dev/null || echo "unknown")","commitCount":${COMMIT_COUNT:-0},"durationSeconds":$((SECONDS - ${START_TIME:-SECONDS})),"warnings":[],"completedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)"} +HEOF + )" + python3 - "$hist" "$entry" <<'PYEOF' 2>/dev/null || true +import json, sys, os +hist_path, entry_json = sys.argv[1], sys.argv[2] +try: + with open(hist_path) as f: + history = json.load(f) + if not isinstance(history, list): + history = [] +except Exception: + history = [] +history.insert(0, json.loads(entry_json)) +history = history[:50] +with open(hist_path, 'w') as f: + json.dump(history, f, indent=2) +PYEOF +} + # ============================================================================= # Parse Arguments # ============================================================================= @@ -356,6 +429,11 @@ if [[ "$DRY_RUN" == "true" ]]; then fi trap on_failure EXIT +# Explicit SIGTERM/SIGINT traps: bash does NOT run the EXIT trap when the +# shell is killed by an untrapped signal — hence the marcelle v2.9.2 → v2.9.3 +# SIGTERM-kill wrote no failure result. Worst case the handler fires twice, +# and write_result uses `>` so the second write is idempotent. +trap on_failure TERM INT acquire_lock load_env @@ -652,9 +730,13 @@ for a in json.load(sys.stdin).get('assets', []): # Save user paths save_user_paths - # Sync new files, preserving .env + # Sync new files, preserving .env. VERSION is staged to a pending + # location and only promoted after Phase 7 verification succeeds (Fix B), + # so interrupted upgrades don't leave a misleading "upgraded" marker. write_progress 3 "Code Update" 40 "Applying update..."
- rsync -a --exclude='.env' "$UPDATE_SRC/" "$PROJECT_DIR/" + rsync -a --exclude='.env' --exclude='VERSION' "$UPDATE_SRC/" "$PROJECT_DIR/" + mkdir -p "$UPGRADE_DIR" + cp "$UPDATE_SRC/VERSION" "$UPGRADE_DIR/VERSION.pending" # Restore user paths restore_user_paths @@ -861,14 +943,17 @@ if [[ "$USE_REGISTRY" == "true" ]]; then write_progress 4 "Container Rebuild" 55 "Pulling images from registry..." # Pull core app containers: try SHA tag → :latest fallback → source build + # NOTE: stderr intentionally flows through so slow/broken pulls are visible + # in logs/upgrade-watcher.log. Previously silenced, which left the v2.9.3 + # systemd-killed upgrade with zero error trace. PULLED_TAG="" - if docker compose pull api admin media-api 2>/dev/null; then + if docker compose pull api admin media-api; then success "Core images pulled from registry (tag: ${REGISTRY_TAG})" PULLED_TAG="$REGISTRY_TAG" elif [[ "$REGISTRY_TAG" != "latest" ]]; then warn "Tag :${REGISTRY_TAG} not in registry — trying :latest" export IMAGE_TAG="latest" - if docker compose pull api admin media-api 2>/dev/null; then + if docker compose pull api admin media-api; then success "Core images pulled from registry (tag: latest)" PULLED_TAG="latest" # Retag :latest as :SHA so compose up uses consistent tags @@ -891,12 +976,12 @@ if [[ "$USE_REGISTRY" == "true" ]]; then # nginx: try SHA → :latest → rebuild if config changed NGINX_PULLED=false - if docker compose pull nginx 2>/dev/null; then + if docker compose pull nginx; then success "nginx pulled from registry (tag: ${IMAGE_TAG})" NGINX_PULLED=true elif [[ "$REGISTRY_TAG" != "latest" ]]; then export IMAGE_TAG="latest" - if docker compose pull nginx 2>/dev/null; then + if docker compose pull nginx; then docker tag "${REGISTRY}/changemaker-nginx:latest" "${REGISTRY}/changemaker-nginx:${REGISTRY_TAG}" 2>/dev/null || true success "nginx pulled from registry (tag: latest)" NGINX_PULLED=true @@ -938,7 +1023,7 @@ fi # Optionally pull third-party images if [[ 
"$PULL_SERVICES" == "true" ]]; then info "Pulling latest third-party images..." - docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog 2>/dev/null || true + docker compose pull v2-postgres redis listmonk-app listmonk-db gitea-app nocodb-v2 mailhog || true success "Third-party images updated" # Record image digests for audit trail @@ -1217,10 +1302,48 @@ fi if [[ "$VERIFY_FAILED" == "true" ]]; then warn "Some health checks failed. Services may still be starting." info "Check logs: docker compose logs --tail 50" + UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]' else success "All health checks passed" fi +# --- External reachability probe (Fix C) --- +# Non-fatal: a Pangolin resource misassignment or DNS flap wouldn't be +# caught by the localhost-only checks above. Warn (don't fail) because +# transient tunnel issues should not roll back a successful upgrade. +if [[ -n "${DOMAIN:-}" ]] && command -v curl >/dev/null 2>&1; then + info "Probing external API at https://api.${DOMAIN}/api/health ..." + EXT_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \ + "https://api.${DOMAIN}/api/health" 2>/dev/null || echo "000")" + if [[ "$EXT_CODE" == "200" ]]; then + success "External API reachable (HTTP 200)" + else + warn "External API probe returned HTTP ${EXT_CODE} — check Pangolin tunnel" + UPGRADE_WARNINGS="$(python3 - "$UPGRADE_WARNINGS" "$DOMAIN" "$EXT_CODE" <<'PYEOF' 2>/dev/null || echo "$UPGRADE_WARNINGS" +import json, sys +try: + w = json.loads(sys.argv[1]) if sys.argv[1] else [] +except Exception: + w = [] +if not isinstance(w, list): + w = [] +w.append(f"External API https://api.{sys.argv[2]}/api/health returned HTTP {sys.argv[3]}") +print(json.dumps(w)) +PYEOF +)" + fi +fi + +# --- Atomic VERSION promotion (Fix B) --- +# The staged VERSION from Phase 3 lands only now, after full verification. 
+# On any prior failure, on_failure removes VERSION.pending and the live +# VERSION file remains at the pre-upgrade value — so upgrade-check.sh +# correctly reports "upgrade available" on the next check. +if [[ "$VERIFY_FAILED" != "true" ]] && [[ -f "$UPGRADE_DIR/VERSION.pending" ]]; then + mv "$UPGRADE_DIR/VERSION.pending" "$PROJECT_DIR/VERSION" + success "VERSION promoted to $(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "?")" +fi + # ============================================================================= # Summary # ============================================================================= @@ -1232,12 +1355,6 @@ else FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")" fi -# Collect warnings for API mode result -UPGRADE_WARNINGS="[]" -if [[ "$VERIFY_FAILED" == "true" ]]; then - UPGRADE_WARNINGS='["Some health checks failed after upgrade — services may still be starting"]' -fi - write_progress 7 "Verification" 100 "Upgrade complete!" write_result "true" "Upgraded ${PRE_UPGRADE_SHORT} → ${FINAL_COMMIT} (${COMMIT_COUNT} commits)" "$UPGRADE_WARNINGS"