fix(upgrade): Phase 1 of upgrade-flow redesign (Approach A)

Three coordinated fixes from the upgrade-flow redesign plan (/home/bunker-admin/.claude/plans/okay-so-we-can-enumerated-hejlsberg.md): 1. scripts/lib/mkdocs-snapshot.sh (NEW): pre-upgrade tarball snapshot of the entire mkdocs/ directory into the install root as mkdocs-backup-<timestamp>.tar.gz. Discoverable via `ls`, retained last 5. No-regrets fallback if anything in the upgrade goes sideways. Sourced by upgrade.sh (and later by image-upgrade.sh under Approach B). 2. scripts/upgrade.sh Phase 6 self-destruct fix: previously, the broad `docker compose up -d` recreated the ccp-agent container that was running the script, sending SIGKILL to the bash process before write_result could land result.json. Marcelle's test upgrade hit this tonight. Fix: temporarily remove `ccp-agent` from COMPOSE_PROFILES during Phase 6's broad up -d, then schedule a detached `nohup ... & disown` restart at the very end of the script (after write_result and archive_success_to_history). The deferred subshell sleeps 3s, then recreates ccp-agent under its profile, picking up the new image. 3. scripts/upgrade-stash-cleanup.sh (NEW): one-shot utility to list and drop accumulated `upgrade-*` git stashes left over by older upgrade.sh runs whose pop failed silently (Pride Corner has three from 2026-03-09 alone). Warns loudly if any stash holds tenant mkdocs.yml content so operators verify recovery before dropping. The .gitignore now excludes /mkdocs-backup-*.tar.gz so the rescue archives don't leak into commits. This is Phase 1 of three: Approach B (image-only upgrade mode) and Approach C (CCP template re-render) follow in subsequent commits. Bunker Admin
2026-05-20 20:43:34 -06:00 · 2026-05-20 20:43:34 -06:00 · 9613c3ec81
commit 9613c3ec81
parent e88ac79ae8
4 changed files with 275 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -64,6 +64,11 @@ core.*
 /backups/
 .upgrade.lock

+# Pre-upgrade mkdocs snapshots (created by scripts/lib/mkdocs-snapshot.sh).
+# These are the tenant-content rescue archives written before every upgrade;
+# discoverable in the install root via `ls`. Retention: last 5 (see helper).
+/mkdocs-backup-*.tar.gz
+
 # Release tarballs (generated by build-release.sh)
 /releases/

--- a/scripts/lib/mkdocs-snapshot.sh
+++ b/scripts/lib/mkdocs-snapshot.sh
@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# =============================================================================
+# mkdocs-snapshot.sh — shared library function
+# =============================================================================
+# Defines snapshot_mkdocs(): writes a tarball of mkdocs/ into the install root
+# as mkdocs-backup-<timestamp>.tar.gz, keeping the last 5 snapshots.
+#
+# Sourced by scripts/upgrade.sh and scripts/image-upgrade.sh (and may be
+# invoked agent-side by changemaker-control-panel during template re-render).
+#
+# Why the install root instead of backups/?
+#   - Discoverable: operators see mkdocs-backup-*.tar.gz with a plain `ls`.
+#   - The agent's /app/instance bind mount maps directly to the install root,
+#     so the agent can restore from this archive without path translation.
+#   - backups/ is owned by root in some installs (DB dumps via container)
+#     and gets rotated on a different schedule than docs snapshots.
+#
+# Restoration one-liner:
+#   tar xzf "$(ls -t mkdocs-backup-*.tar.gz | head -1)" -C . \
+#     && docker compose restart mkdocs mkdocs-site-server
+#
+# Requires: $PROJECT_DIR (absolute path to install root), info() function
+# from the caller (falls back to plain echo if info is not defined).
+# =============================================================================
+
+# Fallback log function if caller didn't define one (e.g. when sourcing standalone)
+if ! declare -F info >/dev/null 2>&1; then
+  info() { echo "[INFO] $*"; }
+fi
+if ! declare -F warn >/dev/null 2>&1; then
+  warn() { echo "[WARN] $*" >&2; }
+fi
+
+# snapshot_mkdocs — take a tarball of mkdocs/ into the install root.
+#
+# Returns 0 if successful (or if mkdocs/ doesn't exist — non-fatal).
+# Returns non-zero only if tar itself fails AND $SNAPSHOT_REQUIRED is true.
+#
+# Optional env vars:
+#   PROJECT_DIR      (required) Install root containing mkdocs/
+#   SNAPSHOT_KEEP    Number of snapshots to retain (default 5)
+#   SNAPSHOT_REQUIRED  If "true", failure to snapshot aborts (default false)
+snapshot_mkdocs() {
+  if [[ -z "${PROJECT_DIR:-}" ]]; then
+    warn "snapshot_mkdocs: PROJECT_DIR not set; skipping"
+    return 0
+  fi
+
+  if [[ ! -d "${PROJECT_DIR}/mkdocs" ]]; then
+    # No mkdocs dir = nothing to snapshot. Common on minimal installs.
+    return 0
+  fi
+
+  local stamp
+  stamp="$(date +%Y%m%d_%H%M%S)"
+  local archive="${PROJECT_DIR}/mkdocs-backup-${stamp}.tar.gz"
+  local keep="${SNAPSHOT_KEEP:-5}"
+
+  if tar czf "$archive" -C "$PROJECT_DIR" mkdocs 2>/dev/null; then
+    local size
+    size="$(du -h "$archive" 2>/dev/null | cut -f1)"
+    info "Tenant docs snapshot: $(basename "$archive") (${size})"
+  else
+    warn "snapshot_mkdocs: tar failed for $archive"
+    rm -f "$archive" 2>/dev/null
+    if [[ "${SNAPSHOT_REQUIRED:-false}" == "true" ]]; then
+      return 1
+    fi
+    return 0
+  fi
+
+  # Retention: keep the most recent N snapshots, prune older ones.
+  # ls -t lists newest first; tail -n +N+1 selects items after the Nth.
+  local prune_from=$((keep + 1))
+  # shellcheck disable=SC2012  # ls is intentional for mtime sort
+  ls -t "${PROJECT_DIR}"/mkdocs-backup-*.tar.gz 2>/dev/null \
+    | tail -n +${prune_from} \
+    | xargs -r rm -f
+
+  return 0
+}
--- a/scripts/upgrade-stash-cleanup.sh
+++ b/scripts/upgrade-stash-cleanup.sh
@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# =============================================================================
+# upgrade-stash-cleanup.sh — clean up stale upgrade-* git stashes
+# =============================================================================
+# Older versions of upgrade.sh used `git stash push --include-untracked` to
+# protect tenant content during pulls. When pop conflicts went unresolved,
+# the stashes accumulated in `git stash list` forever — Pride Corner ended up
+# with three from 2026-03-09 alone, each containing displaced tenant
+# customizations that the running site no longer reflected.
+#
+# This script lists every `upgrade-*` stash, shows its scope, and offers to
+# drop them. It does NOT auto-restore content; that's a separate decision per
+# tenant. The intent is to clear the backlog so future `git stash list` is
+# meaningful.
+#
+# Usage:
+#   bash scripts/upgrade-stash-cleanup.sh          # interactive, lists + prompts
+#   bash scripts/upgrade-stash-cleanup.sh --dry    # list only
+#   bash scripts/upgrade-stash-cleanup.sh --yes    # drop all upgrade-* without prompt
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+cd "$PROJECT_DIR"
+
+# Colors
+if [[ -t 1 ]] && [[ -z "${NO_COLOR:-}" ]]; then
+  RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[0;33m' CYAN='\033[0;36m'
+  BOLD='\033[1m' NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' CYAN='' BOLD='' NC=''
+fi
+
+info() { echo -e "${CYAN}[INFO]${NC} $*"; }
+ok()   { echo -e "${GREEN}[ OK ]${NC} $*"; }
+warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
+
+# Parse flags. Unknown arguments are rejected rather than ignored: a mistyped
+# --dry combined with --yes would otherwise drop every stash unprompted.
+DRY=false
+YES=false
+for arg in "$@"; do
+  case "$arg" in
+    --dry|--dry-run) DRY=true ;;
+    --yes|-y)        YES=true ;;
+    --help|-h)
+      # Print this file's header comment block: every comment line after the
+      # shebang up to the first non-comment line, without the "# =====" rules
+      # and with the leading "# " stripped.
+      awk 'NR == 1 { next }
+           !/^#/ { exit }
+           /^# =====/ { next }
+           { sub(/^# ?/, ""); print }' "$0"
+      exit 0
+      ;;
+    *)
+      warn "Unknown option: $arg (see --help)"
+      exit 2
+      ;;
+  esac
+done
+
+if [[ ! -d .git ]]; then
+  warn "Not a git repository — this script only applies to source installs."
+  exit 0
+fi
+
+# Collect upgrade-* stash refs
+mapfile -t STASHES < <(git stash list 2>/dev/null | grep -E ': (On|WIP on) [^:]+: upgrade-' || true)
+
+if [[ ${#STASHES[@]} -eq 0 ]]; then
+  ok "No upgrade-* stashes found. Nothing to clean up."
+  exit 0
+fi
+
+echo ""
+echo -e "${BOLD}Found ${#STASHES[@]} upgrade-* stash(es):${NC}"
+echo ""
+for entry in "${STASHES[@]}"; do
+  REF="${entry%%:*}"
+  LABEL="${entry#*: }"
+  FILE_COUNT=$(git stash show "$REF" --name-only 2>/dev/null | wc -l)
+  HAS_MKDOCS_YML=$(git stash show "$REF" --name-only 2>/dev/null | grep -c '^mkdocs/mkdocs\.yml$' || true)
+  printf "  %-12s  %-50s  files=%-4d  mkdocs.yml=%s\n" \
+    "$REF" "$LABEL" "$FILE_COUNT" "$HAS_MKDOCS_YML"
+done
+echo ""
+
+if [[ "$DRY" == "true" ]]; then
+  info "Dry-run: no stashes will be dropped."
+  exit 0
+fi
+
+# Warn loudly if any stash holds mkdocs.yml — operator should manually review
+# before dropping (tenant content might be there).
+MKDOCS_STASHES=$(printf '%s\n' "${STASHES[@]}" \
+  | while read -r entry; do
+      REF="${entry%%:*}"
+      if git stash show "$REF" --name-only 2>/dev/null | grep -q '^mkdocs/mkdocs\.yml$'; then
+        echo "$REF"
+      fi
+    done)
+
+if [[ -n "$MKDOCS_STASHES" ]]; then
+  echo ""
+  echo -e "${RED}${BOLD}⚠ WARNING:${NC} the following stashes contain ${BOLD}mkdocs/mkdocs.yml${NC}:"
+  echo "$MKDOCS_STASHES" | sed 's/^/    /'
+  echo ""
+  echo "   These may hold tenant branding (site_name, site_url, custom theme, etc.)"
+  echo "   that ISN'T reflected on disk. Before dropping, verify:"
+  echo ""
+  echo "     git show <stash-ref>:mkdocs/mkdocs.yml | head -10"
+  echo "     diff <(git show <stash-ref>:mkdocs/mkdocs.yml) mkdocs/mkdocs.yml"
+  echo ""
+  echo "   If disk mkdocs.yml already has the tenant content, the stash is safe to drop."
+  echo "   If disk is upstream and stash has tenant content, restore first:"
+  echo "     git checkout <stash-ref> -- mkdocs/mkdocs.yml"
+  echo ""
+fi
+
+if [[ "$YES" != "true" ]]; then
+  echo -en "${BOLD}Drop all ${#STASHES[@]} upgrade-* stashes? [y/N] ${NC}"
+  read -r CONFIRM
+  case "$CONFIRM" in
+    y|Y|yes|YES) ;;
+    *) info "Cancelled. No stashes dropped."; exit 0 ;;
+  esac
+fi
+
+# Drop the highest index first: removing stash@{N} renumbers only the
+# entries above N, so the lower refs collected earlier stay valid.
+mapfile -t SORTED_REFS < <(printf '%s\n' "${STASHES[@]}" \
+  | sed 's/:.*//' \
+  | sort -t'{' -k2 -n -r)
+
+for REF in "${SORTED_REFS[@]}"; do
+  if git stash drop "$REF" >/dev/null 2>&1; then
+    ok "Dropped $REF"
+  else
+    warn "Failed to drop $REF (already gone?)"
+  fi
+done
+
+echo ""
+ok "Cleanup complete. Remaining stashes:"
+# `git stash list` exits 0 with no output when nothing is stashed, so an
+# exit-status fallback never fires; test the output itself to decide whether
+# to print the "(none)" placeholder.
+REMAINING="$(git stash list 2>/dev/null || true)"
+if [[ -n "$REMAINING" ]]; then
+  printf '%s\n' "$REMAINING"
+else
+  echo "  (none)"
+fi
--- a/scripts/upgrade.sh
+++ b/scripts/upgrade.sh
@ -95,6 +95,14 @@ phase() {
  echo ""
 }

+# Load snapshot_mkdocs() — the pre-upgrade tenant docs snapshot that serves
+# as the no-regrets fallback in Phase 2. Loaded regardless of install mode;
+# when the helper file is absent the function is simply left undefined.
+# shellcheck source=lib/mkdocs-snapshot.sh disable=SC1091
+if [[ -f "${SCRIPT_DIR}/lib/mkdocs-snapshot.sh" ]]; then
+  source "${SCRIPT_DIR}/lib/mkdocs-snapshot.sh"
+fi
+
 # --- API mode: JSON progress/result writing ---
 UPGRADE_DIR="${PROJECT_DIR}/data/upgrade"
 PROGRESS_FILE="${UPGRADE_DIR}/progress.json"
@ -709,6 +717,18 @@ fi
 phase "2" "Backup"
 write_progress 2 "Backup" 15 "Creating backup..."

+# Pre-upgrade tenant docs snapshot — the no-regrets fallback. Runs even when
+# --skip-backup is set, because this is for tenant content recovery (not DB
+# state) and is fast enough that skipping it would never be intentional. It
+# lives in the install root (not backups/) so operators discover it via `ls`.
+#
+# snapshot_mkdocs returns non-zero only when SNAPSHOT_REQUIRED=true, i.e. the
+# operator asked for a failed snapshot to stop the upgrade — so honor that
+# here instead of downgrading it to a warning.
+if declare -F snapshot_mkdocs >/dev/null 2>&1; then
+  if [[ "$DRY_RUN" == "true" ]]; then
+    info "[DRY RUN] Would snapshot mkdocs/ to ${PROJECT_DIR}/mkdocs-backup-*.tar.gz"
+  elif ! snapshot_mkdocs; then
+    if [[ "${SNAPSHOT_REQUIRED:-false}" == "true" ]]; then
+      # NOTE(review): exits directly; confirm the script's EXIT handling
+      # records the failure for API-mode callers.
+      warn "mkdocs snapshot failed and SNAPSHOT_REQUIRED=true; aborting upgrade"
+      exit 1
+    fi
+    warn "mkdocs snapshot failed (non-fatal; continuing)"
+  fi
+fi
+
 if [[ "$SKIP_BACKUP" == "true" ]]; then
  warn "Backup skipped (--skip-backup --force)"
 else
@ -1284,13 +1304,24 @@ while true; do
 done
 success "API healthy (${API_WAIT}s)"

-# Start everything else (exclude one-shot init containers)
+# Start everything else (exclude one-shot init containers AND the ccp-agent
+# service that's running this very script). Recreating ccp-agent here would
+# SIGKILL the script process before write_result has a chance to run; we
+# instead schedule a detached restart at the very end of the script.
+#
+# Mechanism: temporarily drop "ccp-agent" from COMPOSE_PROFILES so the broad
+# `up -d` doesn't include it. We re-add it only when scheduling the deferred
+# restart so the new agent comes up under its profile.
+#
+# The `|| true` guards the filter: when ccp-agent is the ONLY profile,
+# `grep -v` selects nothing and exits 1, which under `set -o pipefail` plus
+# `set -e` would abort the upgrade at this assignment.
+# NOTE(review): this reads COMPOSE_PROFILES from the shell environment; if
+# profiles are only defined in .env, PROFILES_SAVED is empty here — confirm.
 info "Starting remaining services..."
+PROFILES_SAVED="${COMPOSE_PROFILES:-}"
+COMPOSE_PROFILES_WITHOUT_AGENT="$(echo "${PROFILES_SAVED}" \
+  | tr ',' '\n' | { grep -vx 'ccp-agent' || true; } | paste -sd, -)"
+COMPOSE_PROFILES="${COMPOSE_PROFILES_WITHOUT_AGENT}" \
 docker compose up -d \
  --scale listmonk-init=0 \
  --scale gancio-init=0 \
  --scale vaultwarden-init=0
-success "All services started"
+success "All services started (ccp-agent restart deferred to end-of-script)"

 # Restart Pangolin tunnel connector if running (may hold stale state after nginx rebuild)
 if docker ps --format '{{.Names}}' | grep -q 'newt'; then
@ -1461,6 +1492,27 @@ echo -e "  ${BOLD}Duration:${NC}  $ELAPSED"
 echo -e "  ${BOLD}Log:${NC}       $LOG_FILE"
 echo ""

+# Deferred ccp-agent restart — the LAST thing the script does before exit.
+# This must run AFTER write_result and archive_success_to_history so the new
+# agent comes up to a complete result.json (otherwise CCP polls forever).
+# We launch a detached shell that:
+#   1. Sleeps briefly so this script has time to exit cleanly first.
+#   2. Restarts ccp-agent under its profile, picking up any new image.
+# `nohup` + `disown` detach that shell from this script so it is not taken
+# down when the script exits or its parent agent process is killed.
+# NOTE(review): if upgrade.sh runs inside the ccp-agent container, the
+# detached shell presumably shares that container's lifetime — confirm it
+# really survives the container being recreated.
+if echo "${PROFILES_SAVED:-}" | tr ',' '\n' | grep -qx 'ccp-agent'; then
+  info "Scheduling deferred ccp-agent restart..."
+  # The install path is passed as a positional argument ($1) instead of being
+  # spliced into the command string, so quotes or other shell metacharacters
+  # in PROJECT_DIR cannot break (or inject into) the deferred command. If the
+  # cd fails, the restart is skipped rather than run from the wrong directory.
+  nohup bash -c '
+    sleep 3
+    cd "$1" || exit 1
+    COMPOSE_PROFILES="ccp-agent" docker compose --profile ccp-agent up -d ccp-agent
+  ' _ "$PROJECT_DIR" >/dev/null 2>&1 < /dev/null &
+  disown
+  success "ccp-agent restart scheduled (will pick up new image)"
+fi
+
 release_lock
 trap - EXIT